commit 14c678ceac362b99ae37a448a6859dfa1227898b Author: Zhongwei Li Date: Sat Nov 29 18:47:40 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..f61a7ec --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "tailscale-sshsync-agent", + "description": "Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync.", + "version": "0.0.0-2025.11.28", + "author": { + "name": "William VanSickle III", + "email": "noreply@humanfrontierlabs.com" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..281b1f4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,163 @@ +# Changelog + +All notable changes to Tailscale SSH Sync Agent will be documented here. + +Format based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +Versioning follows [Semantic Versioning](https://semver.org/). + +## [1.0.0] - 2025-10-19 + +### Added + +**Core Functionality:** +- `sshsync_wrapper.py`: Python interface to sshsync CLI operations + - `get_host_status()`: Check online/offline status of hosts + - `execute_on_all()`: Run commands on all configured hosts + - `execute_on_group()`: Run commands on specific groups + - `execute_on_host()`: Run commands on single host + - `push_to_hosts()`: Push files to multiple hosts (with groups support) + - `pull_from_host()`: Pull files from hosts + - `list_hosts()`: List all configured hosts + - `get_groups()`: Get group configuration + +- `tailscale_manager.py`: Tailscale-specific operations + - `get_tailscale_status()`: Get complete network status + - `check_connectivity()`: Ping hosts via Tailscale + - `get_peer_info()`: Get detailed peer information + - `list_online_machines()`: List all online Tailscale machines + - `validate_tailscale_ssh()`: Check if Tailscale SSH works for a host + - `get_network_summary()`: Human-readable network summary + +- `load_balancer.py`: Intelligent task distribution + - `get_machine_load()`: Get CPU, memory, disk metrics for a machine + - `select_optimal_host()`: Pick best host based on current load + - `get_group_capacity()`: Get aggregate capacity of a group + - `distribute_tasks()`: Distribute multiple tasks optimally across hosts + - `format_load_report()`: Format load metrics as human-readable report + +- `workflow_executor.py`: Common multi-machine workflows + - `deploy_workflow()`: Full deployment pipeline (staging → test → production) + - `backup_workflow()`: Backup files from multiple hosts + - `sync_workflow()`: Sync files from one host to many + - `rolling_restart()`: Zero-downtime service restart across group + - `health_check_workflow()`: Check health endpoints across group + +**Utilities:** +- `utils/helpers.py`: Common formatting and parsing functions + - Byte formatting (`format_bytes`) + - Duration formatting (`format_duration`) + - Percentage formatting (`format_percentage`) + - SSH config parsing (`parse_ssh_config`) + - sshsync config parsing (`parse_sshsync_config`) + - System metrics parsing (`parse_disk_usage`, `parse_memory_usage`, `parse_cpu_load`) + - Load score calculation (`calculate_load_score`) + - Status classification (`classify_load_status`, `classify_latency`) + - Safe command execution (`run_command`, `safe_execute`) + +- `utils/validators/`: Comprehensive validation system + - `parameter_validator.py`: Input validation (hosts, groups, paths, timeouts, commands) + - `host_validator.py`: Host configuration and availability validation + - `connection_validator.py`: SSH and Tailscale connection validation + +**Testing:** +- `tests/test_integration.py`: 11 end-to-end integration tests +- `tests/test_helpers.py`: 11 helper function tests +- `tests/test_validation.py`: 7 validation tests +- **Total: 29 tests** covering all major functionality + +**Documentation:** +- `SKILL.md`: Complete skill documentation (6,000+ words) + - When to use this skill + - How it works + - Data sources (sshsync CLI, Tailscale) + - Detailed workflows for each operation type + - Available scripts and functions + - Error handling and validations + - Performance and caching strategies + - Usage examples +- `references/sshsync-guide.md`: Complete sshsync CLI reference +- `references/tailscale-integration.md`: Tailscale integration guide +- `README.md`: Installation and quick start guide +- `INSTALLATION.md`: Detailed setup tutorial +- `DECISIONS.md`: Architecture decisions and rationale + +### Data Sources + +**sshsync CLI:** +- Installation: `pip install sshsync` +- Configuration: `~/.config/sshsync/config.yaml` +- SSH config integration: `~/.ssh/config` +- Group-based host management +- Remote command execution with timeouts +- File push/pull operations (single or recursive) +- Status checking and connectivity validation + +**Tailscale:** +- Zero-config VPN with WireGuard encryption +- MagicDNS for easy host addressing +- Built-in SSH capabilities +- Seamless integration with standard SSH +- Peer-to-peer connections +- Works across NATs and firewalls + +### Coverage + +**Operations:** +- Host status monitoring and availability checks +- Intelligent load-based task distribution +- Multi-host command execution (all hosts, groups, individual) +- File synchronization workflows (push/pull) +- Deployment pipelines (staging → production) +- Backup and sync workflows +- Rolling restarts with zero downtime +- Health checking across services + +**Geographic Coverage:** All hosts in Tailscale network (global) + +**Temporal Coverage:** Real-time status and operations + +### Known Limitations + +**v1.0.0:** +- sshsync must be installed separately (`pip install sshsync`) +- Tailscale must be configured separately +- SSH keys must be set up manually on each host +- Load balancing uses simple metrics (CPU, memory, disk) +- No built-in monitoring dashboards (terminal output only) +- No persistence of operation history (logs only) +- Requires SSH config and sshsync config to be manually maintained + +### Planned for v2.0 + +**Enhanced Features:** +- Automated SSH key distribution across hosts +- Built-in operation history and logging database +- Web dashboard for monitoring and operations +- Advanced load balancing with custom metrics +- Scheduled operations and cron integration +- Operation rollback capabilities +- Integration with configuration management tools (Ansible, Terraform) +- Cost tracking for cloud resources +- Performance metrics collection and visualization +- Alert system for failed operations +- Multi-tenancy support for team environments + +**Integrations:** +- Prometheus metrics export +- Grafana dashboard templates +- Slack/Discord notifications +- CI/CD pipeline integration +- Container orchestration support (Docker, Kubernetes) + +## [Unreleased] + +### Planned + +- Add support for Windows hosts (PowerShell remoting) +- Improve performance for large host groups (100+) +- Add SSH connection pooling for faster operations +- Implement operation queueing for long-running tasks +- Add support for custom validation plugins +- Expand coverage to Docker containers via SSH +- Add retry strategies with exponential backoff +- Implement circuit breaker pattern for failing hosts diff --git a/DECISIONS.md b/DECISIONS.md new file mode 100644 index 0000000..2fd49ea --- /dev/null +++ b/DECISIONS.md @@ -0,0 +1,458 @@ +# Architecture Decisions + +Documentation of all technical decisions made for Tailscale SSH Sync Agent. + +## Tool Selection + +### Selected Tool: sshsync + +**Justification:** + +✅ **Advantages:** +- **Ready-to-use**: Available via `pip install sshsync` +- **Group management**: Built-in support for organizing hosts into groups +- **Integration**: Works with existing SSH config (`~/.ssh/config`) +- **Simple API**: Easy-to-wrap CLI interface +- **Parallel execution**: Commands run concurrently across hosts +- **File operations**: Push/pull with recursive support +- **Timeout handling**: Per-command timeouts for reliability +- **Active maintenance**: Regular updates and bug fixes +- **Python-based**: Easy to extend and integrate + +✅ **Coverage:** +- All SSH-accessible hosts +- Works with any SSH server (Linux, macOS, BSD, etc.) +- Platform-agnostic (runs on any OS with Python) + +✅ **Cost:** +- Free and open-source +- No API keys or subscriptions required +- No rate limits + +✅ **Documentation:** +- Clear command-line interface +- PyPI documentation available +- GitHub repository with examples + +**Alternatives Considered:** + +❌ **Fabric (Python library)** +- Pros: Pure Python, very flexible +- Cons: Requires writing more code, no built-in group management +- **Rejected because**: sshsync provides ready-made functionality + +❌ **Ansible** +- Pros: Industry standard, very powerful +- Cons: Requires learning YAML playbooks, overkill for simple operations +- **Rejected because**: Too heavyweight for ad-hoc commands and file transfers + +❌ **pssh (parallel-ssh)** +- Pros: Simple parallel SSH +- Cons: No group management, no file transfer built-in, less actively maintained +- **Rejected because**: sshsync has better group management and file operations + +❌ **Custom SSH wrapper** +- Pros: Full control +- Cons: Reinventing the wheel, maintaining parallel execution logic +- **Rejected because**: sshsync already provides what we need + +**Conclusion:** + +sshsync is the best tool for this use case because it: +1. Provides group-based host management out of the box +2. Handles parallel execution automatically +3. Integrates with existing SSH configuration +4. Supports both command execution and file transfers +5. Requires minimal wrapper code + +## Integration: Tailscale + +**Decision**: Integrate with Tailscale for network connectivity + +**Justification:** + +✅ **Why Tailscale:** +- **Zero-config VPN**: No manual firewall/NAT configuration +- **Secure by default**: WireGuard encryption +- **Works everywhere**: Coffee shop, home, office, cloud +- **MagicDNS**: Easy addressing (machine-name.tailnet.ts.net) +- **Standard SSH**: Works with all SSH tools including sshsync +- **No overhead**: Uses regular SSH protocol over Tailscale network + +✅ **Integration approach:** +- Tailscale provides the network layer +- Standard SSH works over Tailscale +- sshsync operates normally using Tailscale hostnames/IPs +- No Tailscale-specific code needed in core operations +- Tailscale status checking for diagnostics + +**Alternatives:** + +❌ **Direct public internet + port forwarding** +- Cons: Complex firewall setup, security risks, doesn't work on mobile/restricted networks +- **Rejected because**: Requires too much configuration and has security concerns + +❌ **Other VPNs (WireGuard, OpenVPN, ZeroTier)** +- Cons: More manual configuration, less zero-config +- **Rejected because**: Tailscale is easier to set up and use + +**Conclusion:** + +Tailscale + standard SSH is the optimal combination: +- Secure connectivity without configuration +- Works with existing SSH tools +- No vendor lock-in (can use other VPNs if needed) + +## Architecture + +### Structure: Modular Scripts + Utilities + +**Decision**: Separate concerns into focused modules + +``` +scripts/ +├── sshsync_wrapper.py # sshsync CLI interface +├── tailscale_manager.py # Tailscale operations +├── load_balancer.py # Task distribution logic +├── workflow_executor.py # Common workflows +└── utils/ + ├── helpers.py # Formatting, parsing + └── validators/ # Input validation +``` + +**Justification:** + +✅ **Modularity:** +- Each script has single responsibility +- Easy to test independently +- Easy to extend without breaking others + +✅ **Reusability:** +- Helpers used across all scripts +- Validators prevent duplicate validation logic +- Workflows compose lower-level operations + +✅ **Maintainability:** +- Clear file organization +- Easy to locate specific functionality +- Separation of concerns + +**Alternatives:** + +❌ **Monolithic single script** +- Cons: Hard to test, hard to maintain, becomes too large +- **Rejected because**: Doesn't scale well + +❌ **Over-engineered class hierarchy** +- Cons: Unnecessary complexity for this use case +- **Rejected because**: Simple functions are sufficient + +**Conclusion:** + +Modular functional approach provides good balance of simplicity and maintainability. + +### Validation Strategy: Multi-Layer + +**Decision**: Validate at multiple layers + +**Layers:** + +1. **Parameter validation** (`parameter_validator.py`) + - Validates user inputs before any operations + - Prevents invalid hosts, groups, paths, etc. + +2. **Host validation** (`host_validator.py`) + - Validates SSH configuration exists + - Checks host reachability + - Validates group membership + +3. **Connection validation** (`connection_validator.py`) + - Tests actual SSH connectivity + - Verifies Tailscale status + - Checks SSH key authentication + +**Justification:** + +✅ **Early failure:** +- Catch errors before expensive operations +- Clear error messages at each layer + +✅ **Comprehensive:** +- Multiple validation points catch different issues +- Reduces runtime failures + +✅ **User-friendly:** +- Helpful error messages with suggestions +- Clear indication of what went wrong + +**Conclusion:** + +Multi-layer validation provides robust error handling and great user experience. + +## Load Balancing Strategy + +### Decision: Simple Composite Score + +**Formula:** +```python +score = (cpu_pct * 0.4) + (mem_pct * 0.3) + (disk_pct * 0.3) +``` + +**Weights:** +- CPU: 40% (most important for compute tasks) +- Memory: 30% (important for data processing) +- Disk: 30% (important for I/O operations) + +**Justification:** + +✅ **Simple and effective:** +- Easy to understand +- Fast to calculate +- Works well for most workloads + +✅ **Balanced:** +- Considers multiple resource types +- No single metric dominates + +**Alternatives:** + +❌ **CPU only** +- Cons: Ignores memory-bound and I/O-bound tasks +- **Rejected because**: Too narrow + +❌ **Complex ML-based prediction** +- Cons: Overkill, slow, requires training data +- **Rejected because**: Unnecessary complexity + +❌ **Fixed round-robin** +- Cons: Doesn't consider actual load +- **Rejected because**: Can overload already-busy hosts + +**Conclusion:** + +Simple weighted score provides good balance without complexity. + +## Error Handling Philosophy + +### Decision: Graceful Degradation + Clear Messages + +**Principles:** + +1. **Fail early with validation**: Catch errors before operations +2. **Isolate failures**: One host failure doesn't stop others +3. **Clear messages**: Tell user exactly what went wrong and how to fix +4. **Automatic retry**: Retry transient errors (network, timeout) +5. **Dry-run support**: Preview operations before execution + +**Implementation:** + +```python +# Example error handling pattern +try: + validate_host(host) + validate_ssh_connection(host) + result = execute_command(host, command) +except ValidationError as e: + return {'error': str(e), 'suggestion': 'Fix: ...'} +except ConnectionError as e: + return {'error': str(e), 'diagnostics': get_diagnostics(host)} +``` + +**Justification:** + +✅ **Better UX:** +- Users know exactly what's wrong +- Suggestions help fix issues quickly + +✅ **Reliability:** +- Automatic retry handles transient issues +- Dry-run prevents mistakes + +✅ **Debugging:** +- Clear error messages speed up troubleshooting +- Diagnostics provide actionable information + +**Conclusion:** + +Graceful degradation with helpful messages creates better user experience. + +## Caching Strategy + +**Decision**: Minimal caching for real-time accuracy + +**What we cache:** +- Nothing (v1.0.0) + +**Why no caching:** +- Host status changes frequently +- Load metrics change constantly +- Operations need real-time data +- Cache invalidation is complex + +**Future consideration (v2.0):** +- Cache Tailscale status (60s TTL) +- Cache group configuration (5min TTL) +- Cache SSH config parsing (5min TTL) + +**Justification:** + +✅ **Simplicity:** +- No cache invalidation logic needed +- No stale data issues + +✅ **Accuracy:** +- Always get current state +- No surprises from cached data + +**Trade-off:** +- Slightly slower repeated operations +- More network calls + +**Conclusion:** + +For v1.0.0, simplicity and accuracy outweigh performance concerns. Real-time data is more valuable than speed. + +## Testing Strategy + +### Decision: Comprehensive Unit + Integration Tests + +**Coverage:** + +- **29 tests total:** + - 11 integration tests (end-to-end workflows) + - 11 helper tests (formatting, parsing, calculations) + - 7 validation tests (input validation, safety checks) + +**Test Philosophy:** + +1. **Test real functionality**: Integration tests use actual functions +2. **Test edge cases**: Validation tests cover error conditions +3. **Test helpers**: Ensure formatting/parsing works correctly +4. **Fast execution**: All tests run in < 10 seconds +5. **No external dependencies**: Tests don't require Tailscale or sshsync to be running + +**Justification:** + +✅ **Confidence:** +- Tests verify code works as expected +- Catches regressions when modifying code + +✅ **Documentation:** +- Tests show how to use functions +- Examples of expected behavior + +✅ **Reliability:** +- Production-ready code from v1.0.0 + +**Conclusion:** + +Comprehensive testing ensures reliable code from the start. + +## Performance Considerations + +### Parallel Execution + +**Decision**: Leverage sshsync's built-in parallelization + +- sshsync runs commands concurrently across hosts automatically +- No need to implement custom threading/multiprocessing +- Timeout applies per-host independently + +**Trade-offs:** + +✅ **Pros:** +- Simple to use +- Fast for large host groups +- No concurrency bugs + +⚠️ **Cons:** +- Less control over parallelism level +- Can overwhelm network with too many concurrent connections + +**Conclusion:** + +Built-in parallelization is sufficient for most use cases. Custom control can be added in v2.0 if needed. + +## Security Considerations + +### SSH Key Authentication + +**Decision**: Require SSH keys (no password auth) + +**Justification:** + +✅ **Security:** +- Keys are more secure than passwords +- Can't be brute-forced +- Can be revoked per-host + +✅ **Automation:** +- Non-interactive (no password prompts) +- Works in scripts and CI/CD + +**Implementation:** +- Validators check SSH key auth works +- Clear error messages guide users to set up keys +- Documentation explains SSH key setup + +### Command Safety + +**Decision**: Validate dangerous commands + +**Dangerous patterns blocked:** +- `rm -rf /` (root deletion) +- `mkfs.*` (filesystem formatting) +- `dd.*of=/dev/` (direct disk writes) +- Fork bombs +- Direct disk writes + +**Override**: Use `allow_dangerous=True` to bypass + +**Justification:** + +✅ **Safety:** +- Prevents accidental destructive operations +- Dry-run provides preview + +✅ **Flexibility:** +- Can still run dangerous commands if explicitly allowed + +**Conclusion:** + +Safety by default with escape hatch for advanced users. + +## Decisions Summary + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| **CLI Tool** | sshsync | Best balance of features, ease of use, and maintenance | +| **Network** | Tailscale | Zero-config secure VPN, works everywhere | +| **Architecture** | Modular scripts | Clear separation of concerns, maintainable | +| **Validation** | Multi-layer | Catch errors early with helpful messages | +| **Load Balancing** | Composite score | Simple, effective, considers multiple resources | +| **Caching** | None (v1.0) | Simplicity and real-time accuracy | +| **Testing** | 29 tests | Comprehensive coverage for reliability | +| **Security** | SSH keys + validation | Secure and automation-friendly | + +## Trade-offs Accepted + +1. **No caching** → Slightly slower, but always accurate +2. **sshsync dependency** → External tool, but saves development time +3. **SSH key requirement** → Setup needed, but more secure +4. **Simple load balancing** → Less sophisticated, but fast and easy to understand +5. **Terminal UI only** → No web dashboard, but simpler to develop and maintain + +## Future Improvements + +### v2.0 Considerations + +1. **Add caching** for frequently-accessed data (Tailscale status, groups) +2. **Web dashboard** for visualization and monitoring +3. **Operation history** database for audit trail +4. **Advanced load balancing** with custom metrics +5. **Automated SSH key distribution** across hosts +6. **Integration with config management** tools (Ansible, Terraform) +7. **Container support** via SSH to Docker containers +8. **Custom validation plugins** for domain-specific checks + +All decisions prioritize **simplicity**, **security**, and **maintainability** for v1.0.0. diff --git a/INSTALLATION.md b/INSTALLATION.md new file mode 100644 index 0000000..4592bdd --- /dev/null +++ b/INSTALLATION.md @@ -0,0 +1,707 @@ +# Installation Guide + +Complete step-by-step tutorial for setting up Tailscale SSH Sync Agent. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Install Tailscale](#step-1-install-tailscale) +3. [Step 2: Install sshsync](#step-2-install-sshsync) +4. [Step 3: Configure SSH](#step-3-configure-ssh) +5. [Step 4: Configure sshsync Groups](#step-4-configure-sshsync-groups) +6. [Step 5: Install Agent](#step-5-install-agent) +7. [Step 6: Test Installation](#step-6-test-installation) +8. [Troubleshooting](#troubleshooting) + +## Prerequisites + +Before you begin, ensure you have: + +- **Operating System**: macOS, Linux, or BSD +- **Python**: Version 3.10 or higher +- **pip**: Python package installer +- **Claude Code**: Installed and running +- **Remote machines**: At least one machine you want to manage +- **SSH access**: Ability to SSH to remote machines + +**Check Python version**: +```bash +python3 --version +# Should show: Python 3.10.x or higher +``` + +**Check pip**: +```bash +pip3 --version +# Should show: pip xx.x.x from ... +``` + +## Step 1: Install Tailscale + +Tailscale provides secure networking between your machines. + +### macOS + +```bash +# Install via Homebrew +brew install tailscale + +# Start Tailscale +sudo tailscale up + +# Follow authentication link in terminal +# This will open browser to log in +``` + +### Linux (Ubuntu/Debian) + +```bash +# Install Tailscale +curl -fsSL https://tailscale.com/install.sh | sh + +# Start and authenticate +sudo tailscale up + +# Follow authentication link +``` + +### Linux (Fedora/RHEL) + +```bash +# Add repository +sudo dnf config-manager --add-repo https://pkgs.tailscale.com/stable/fedora/tailscale.repo + +# Install +sudo dnf install tailscale + +# Enable and start +sudo systemctl enable --now tailscaled +sudo tailscale up +``` + +### Verify Installation + +```bash +# Check Tailscale status +tailscale status + +# Should show list of machines in your tailnet +# Example output: +# 100.64.1.10 homelab-1 user@ linux - +# 100.64.1.11 laptop user@ macOS - +``` + +**Important**: Install and authenticate Tailscale on **all machines** you want to manage. + +## Step 2: Install sshsync + +sshsync is the CLI tool for managing SSH operations across multiple hosts. + +```bash +# Install via pip +pip3 install sshsync + +# Or use pipx for isolated installation +pipx install sshsync +``` + +### Verify Installation + +```bash +# Check version +sshsync --version + +# Should show: sshsync, version x.x.x +``` + +### Common Installation Issues + +**Issue**: `pip3: command not found` + +**Solution**: +```bash +# macOS +brew install python3 + +# Linux (Ubuntu/Debian) +sudo apt install python3-pip + +# Linux (Fedora/RHEL) +sudo dnf install python3-pip +``` + +**Issue**: Permission denied during install + +**Solution**: +```bash +# Install for current user only +pip3 install --user sshsync + +# Or use pipx +pip3 install --user pipx +pipx install sshsync +``` + +## Step 3: Configure SSH + +SSH configuration defines how to connect to each machine. + +### Step 3.1: Generate SSH Keys (if you don't have them) + +```bash +# Generate ed25519 key (recommended) +ssh-keygen -t ed25519 -C "your_email@example.com" + +# Press Enter to use default location (~/.ssh/id_ed25519) +# Enter passphrase (or leave empty for no passphrase) +``` + +**Output**: +``` +Your identification has been saved in /Users/you/.ssh/id_ed25519 +Your public key has been saved in /Users/you/.ssh/id_ed25519.pub +``` + +### Step 3.2: Copy Public Key to Remote Machines + +For each remote machine: + +```bash +# Copy SSH key to remote +ssh-copy-id user@machine-hostname + +# Example: +ssh-copy-id admin@100.64.1.10 +``` + +**Manual method** (if ssh-copy-id doesn't work): + +```bash +# Display public key +cat ~/.ssh/id_ed25519.pub + +# SSH to remote machine +ssh user@remote-host + +# On remote machine: +mkdir -p ~/.ssh +chmod 700 ~/.ssh +echo "your-public-key-here" >> ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +exit +``` + +### Step 3.3: Test SSH Connection + +```bash +# Test connection (should not ask for password) +ssh user@remote-host "hostname" + +# If successful, should print remote hostname +``` + +### Step 3.4: Create SSH Config File + +Edit `~/.ssh/config`: + +```bash +vim ~/.ssh/config +``` + +**Add host entries**: + +``` +# Production servers +Host prod-web-01 + HostName prod-web-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + Port 22 + +Host prod-web-02 + HostName 100.64.1.21 + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-db-01 + HostName 100.64.1.30 + User deploy + IdentityFile ~/.ssh/id_ed25519 + +# Development +Host dev-laptop + HostName dev-laptop.tailnet.ts.net + User developer + IdentityFile ~/.ssh/id_ed25519 + +Host dev-desktop + HostName 100.64.1.40 + User developer + IdentityFile ~/.ssh/id_ed25519 + +# Homelab +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host homelab-2 + HostName 100.64.1.11 + User admin + IdentityFile ~/.ssh/id_ed25519 +``` + +**Important fields**: +- **Host**: Alias you'll use (e.g., "homelab-1") +- **HostName**: Actual hostname or IP (Tailscale hostname or IP) +- **User**: SSH username on remote machine +- **IdentityFile**: Path to SSH private key + +### Step 3.5: Set Correct Permissions + +```bash +# SSH config should be readable only by you +chmod 600 ~/.ssh/config + +# SSH directory permissions +chmod 700 ~/.ssh + +# Private key permissions +chmod 600 ~/.ssh/id_ed25519 + +# Public key permissions +chmod 644 ~/.ssh/id_ed25519.pub +``` + +### Step 3.6: Verify All Hosts + +Test each host in your config: + +```bash +# Test each host +ssh homelab-1 "echo 'Connection successful'" +ssh prod-web-01 "echo 'Connection successful'" +ssh dev-laptop "echo 'Connection successful'" + +# Should connect without asking for password +``` + +## Step 4: Configure sshsync Groups + +Groups organize your hosts for easy management. + +### Step 4.1: Initialize sshsync Configuration + +```bash +# Sync hosts and create groups +sshsync sync +``` + +**What this does**: +1. Reads all hosts from `~/.ssh/config` +2. Prompts you to assign hosts to groups +3. Creates `~/.config/sshsync/config.yaml` + +### Step 4.2: Follow Interactive Prompts + +``` +Found 7 ungrouped hosts: +1. homelab-1 +2. homelab-2 +3. prod-web-01 +4. prod-web-02 +5. prod-db-01 +6. dev-laptop +7. dev-desktop + +Assign groups now? [Y/n]: Y + +Enter group name for homelab-1 (or skip): homelab +Enter group name for homelab-2 (or skip): homelab +Enter group name for prod-web-01 (or skip): production,web +Enter group name for prod-web-02 (or skip): production,web +Enter group name for prod-db-01 (or skip): production,database +Enter group name for dev-laptop (or skip): development +Enter group name for dev-desktop (or skip): development +``` + +**Tips**: +- Hosts can belong to multiple groups (separate with commas) +- Use meaningful group names (production, development, web, database, homelab) +- Skip hosts you don't want to group yet + +### Step 4.3: Verify Configuration + +```bash +# View generated config +cat ~/.config/sshsync/config.yaml +``` + +**Expected output**: +```yaml +groups: + production: + - prod-web-01 + - prod-web-02 + - prod-db-01 + web: + - prod-web-01 + - prod-web-02 + database: + - prod-db-01 + development: + - dev-laptop + - dev-desktop + homelab: + - homelab-1 + - homelab-2 +``` + +### Step 4.4: Test sshsync + +```bash +# List hosts +sshsync ls + +# List with status +sshsync ls --with-status + +# Test command execution +sshsync all "hostname" + +# Test group execution +sshsync group homelab "uptime" +``` + +## Step 5: Install Agent + +### Step 5.1: Navigate to Agent Directory + +```bash +cd /path/to/tailscale-sshsync-agent +``` + +### Step 5.2: Verify Agent Structure + +```bash +# List files +ls -la + +# Should see: +# .claude-plugin/ +# scripts/ +# tests/ +# references/ +# SKILL.md +# README.md +# VERSION +# CHANGELOG.md +# etc. +``` + +### Step 5.3: Validate marketplace.json + +```bash +# Check JSON is valid +python3 -c "import json; json.load(open('.claude-plugin/marketplace.json')); print('✅ Valid JSON')" + +# Should output: ✅ Valid JSON +``` + +### Step 5.4: Install via Claude Code + +In Claude Code: + +``` +/plugin marketplace add /absolute/path/to/tailscale-sshsync-agent +``` + +**Example**: +``` +/plugin marketplace add /Users/you/tailscale-sshsync-agent +``` + +**Expected output**: +``` +✓ Plugin installed successfully +✓ Skill: tailscale-sshsync-agent +✓ Description: Manages distributed workloads and file sharing... +``` + +### Step 5.5: Verify Installation + +In Claude Code: + +``` +"Which of my machines are online?" +``` + +**Expected response**: Agent should activate and check your Tailscale network. + +## Step 6: Test Installation + +### Test 1: Host Status + +**Query**: +``` +"Which of my machines are online?" +``` + +**Expected**: List of hosts with online/offline status + +### Test 2: List Groups + +**Query**: +``` +"What groups do I have configured?" +``` + +**Expected**: List of your sshsync groups + +### Test 3: Execute Command + +**Query**: +``` +"Check disk space on homelab machines" +``` + +**Expected**: Disk usage for hosts in homelab group + +### Test 4: Dry-Run + +**Query**: +``` +"Show me what would happen if I ran 'uptime' on all machines (dry-run)" +``` + +**Expected**: Preview without execution + +### Test 5: Run Test Suite + +```bash +cd /path/to/tailscale-sshsync-agent + +# Run all tests +python3 tests/test_integration.py + +# Should show: +# Results: 11/11 passed +# 🎉 All tests passed! +``` + +## Troubleshooting + +### Agent Not Activating + +**Symptoms**: Agent doesn't respond to queries about machines/hosts + +**Solutions**: + +1. **Check installation**: + ``` + /plugin list + ``` + Should show `tailscale-sshsync-agent` in list. + +2. **Reinstall**: + ``` + /plugin remove tailscale-sshsync-agent + /plugin marketplace add /path/to/tailscale-sshsync-agent + ``` + +3. **Check marketplace.json**: + ```bash + cat .claude-plugin/marketplace.json + # Verify "description" field matches SKILL.md frontmatter + ``` + +### SSH Connection Fails + +**Symptoms**: "Permission denied" or "Connection refused" + +**Solutions**: + +1. **Check SSH key**: + ```bash + ssh-add -l + # Should list your SSH key + ``` + + If not listed: + ```bash + ssh-add ~/.ssh/id_ed25519 + ``` + +2. **Test SSH directly**: + ```bash + ssh -v hostname + # -v shows verbose debug info + ``` + +3. **Verify authorized_keys on remote**: + ```bash + ssh hostname "cat ~/.ssh/authorized_keys" + # Should contain your public key + ``` + +### Tailscale Connection Issues + +**Symptoms**: Hosts show as offline in Tailscale + +**Solutions**: + +1. **Check Tailscale status**: + ```bash + tailscale status + ``` + +2. **Restart Tailscale**: + ```bash + # macOS + brew services restart tailscale + + # Linux + sudo systemctl restart tailscaled + ``` + +3. **Re-authenticate**: + ```bash + sudo tailscale up + ``` + +### sshsync Errors + +**Symptoms**: "sshsync: command not found" + +**Solutions**: + +1. **Reinstall sshsync**: + ```bash + pip3 install --upgrade sshsync + ``` + +2. **Check PATH**: + ```bash + which sshsync + # Should show path to sshsync + ``` + + If not found, add to PATH: + ```bash + echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc + source ~/.bashrc + ``` + +### Config File Issues + +**Symptoms**: "Group not found" or "Host not found" + +**Solutions**: + +1. **Verify SSH config**: + ```bash + cat ~/.ssh/config + # Check host aliases are correct + ``` + +2. **Verify sshsync config**: + ```bash + cat ~/.config/sshsync/config.yaml + # Check groups are defined + ``` + +3. **Re-sync**: + ```bash + sshsync sync + ``` + +### Test Failures + +**Symptoms**: Tests fail with errors + +**Solutions**: + +1. **Check dependencies**: + ```bash + pip3 list | grep -E "sshsync|pyyaml" + ``` + +2. **Check Python version**: + ```bash + python3 --version + # Must be 3.10+ + ``` + +3. **Run tests individually**: + ```bash + python3 tests/test_helpers.py + python3 tests/test_validation.py + python3 tests/test_integration.py + ``` + +## Post-Installation + +### Recommended Next Steps + +1. **Create more groups** for better organization: + ```bash + sshsync gadd staging + sshsync gadd backup-servers + ``` + +2. **Test file operations**: + ``` + "Push test file to homelab machines (dry-run)" + ``` + +3. **Set up automation**: + - Create scripts for common tasks + - Schedule backups + - Automate deployments + +4. **Review documentation**: + - Read `references/sshsync-guide.md` for advanced sshsync usage + - Read `references/tailscale-integration.md` for Tailscale tips + +### Security Checklist + +- ✅ SSH keys are password-protected +- ✅ SSH config has correct permissions (600) +- ✅ Private keys have correct permissions (600) +- ✅ Tailscale ACLs configured (if using teams) +- ✅ Only necessary hosts have SSH access +- ✅ Regularly review connected devices in Tailscale + +## Summary + +You now have: + +1. ✅ Tailscale installed and connected +2. ✅ sshsync installed and configured +3. ✅ SSH keys set up on all machines +4. ✅ SSH config with all hosts +5. ✅ sshsync groups organized +6. ✅ Agent installed in Claude Code +7. ✅ Tests passing + +**Start using**: + +``` +"Which machines are online?" +"Run this on the least loaded machine" +"Push files to production servers" +"Deploy to staging then production" +``` + +For more examples, see README.md and SKILL.md. + +## Support + +If you encounter issues: + +1. Check this troubleshooting section +2. Review references/ for detailed guides +3. Check DECISIONS.md for architecture rationale +4. Run tests to verify installation + +Happy automating! 🚀 diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2a2afc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# tailscale-sshsync-agent + +Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..71b00d4 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,1204 @@ +--- +name: tailscale-sshsync-agent +description: Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync. Activates when discussing remote machines, Tailscale SSH, workload distribution, file sharing, or multi-host operations. +--- + +# Tailscale SSH Sync Agent + +## When to Use This Skill + +This skill automatically activates when you need to: + +✅ **Distribute workloads** across multiple machines +- "Run this on my least loaded machine" +- "Execute this task on the machine with most resources" +- "Balance work across my Tailscale network" + +✅ **Share files** between Tailscale-connected hosts +- "Push this directory to all my development machines" +- "Sync code across my homelab servers" +- "Deploy configuration to production group" + +✅ **Execute commands** remotely across host groups +- "Run system updates on all servers" +- "Check disk space across web-servers group" +- "Restart services on database hosts" + +✅ **Monitor machine availability** and health +- "Which machines are online?" +- "Show status of my Tailscale network" +- "Check connectivity to remote hosts" + +✅ **Automate multi-machine workflows** +- "Deploy to staging, test, then production" +- "Backup files from all machines" +- "Synchronize development environment across laptops" + +## How It Works + +This agent provides intelligent workload distribution and file sharing management across Tailscale SSH-connected machines using the `sshsync` CLI tool. + +**Core Architecture**: + +1. **SSH Sync Wrapper**: Python interface to sshsync CLI operations +2. **Tailscale Manager**: Tailscale-specific connectivity and status management +3. **Load Balancer**: Intelligent task distribution based on machine resources +4. **Workflow Executor**: Common multi-machine workflow automation +5. **Validators**: Parameter, host, and connection validation +6. **Helpers**: Temporal context, formatting, and utilities + +**Key Features**: + +- **Automatic host discovery** via Tailscale and SSH config +- **Intelligent load balancing** based on CPU, memory, and current load +- **Group-based operations** (execute on all web servers, databases, etc.) +- **Dry-run mode** for preview before execution +- **Parallel execution** across multiple hosts +- **Comprehensive error handling** and retry logic +- **Connection validation** before operations +- **Progress tracking** for long-running operations + +## Data Sources + +### sshsync CLI Tool + +**What is sshsync?** + +sshsync is a Python CLI tool for managing SSH connections and executing operations across multiple hosts. It provides: + +- Group-based host management +- Remote command execution with timeouts +- File push/pull operations (single or recursive) +- Integration with existing SSH config (~/.ssh/config) +- Status checking and connectivity validation + +**Installation**: +```bash +pip install sshsync +``` + +**Configuration**: + +sshsync uses two configuration sources: + +1. **SSH Config** (`~/.ssh/config`): Host connection details +2. **sshsync Config** (`~/.config/sshsync/config.yaml`): Group assignments + +**Example SSH Config**: +``` +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host prod-web-01 + HostName 100.64.1.20 + User deploy + Port 22 +``` + +**Example sshsync Config**: +```yaml +groups: + homelab: + - homelab-1 + - homelab-2 + production: + - prod-web-01 + - prod-web-02 + - prod-db-01 + development: + - dev-laptop + - dev-desktop +``` + +**sshsync Commands Used**: + +| Command | Purpose | Example | +|---------|---------|---------| +| `sshsync all` | Execute on all hosts | `sshsync all "df -h"` | +| `sshsync group` | Execute on group | `sshsync group web "systemctl status nginx"` | +| `sshsync push` | Push files to hosts | `sshsync push --group prod ./app /var/www/` | +| `sshsync pull` | Pull files from hosts | `sshsync pull --host db /var/log/mysql ./logs/` | +| `sshsync ls` | List hosts | `sshsync ls --with-status` | +| `sshsync sync` | Sync ungrouped hosts | `sshsync sync` | + +### Tailscale Integration + +**What is Tailscale?** + +Tailscale is a zero-config VPN that creates a secure network between your devices. It provides: + +- **Automatic peer-to-peer connections** via WireGuard +- **Magic DNS** for easy host addressing (e.g., `machine-name.tailnet-name.ts.net`) +- **SSH capabilities** built-in to Tailscale CLI +- **ACLs** for access control + +**Tailscale SSH**: + +Tailscale includes SSH functionality that works seamlessly with standard SSH: + +```bash +# Standard SSH via Tailscale +ssh user@machine-name + +# Tailscale-specific SSH command +tailscale ssh machine-name +``` + +**Integration with sshsync**: + +Since Tailscale SSH uses standard SSH protocol, it works perfectly with sshsync. Just configure your SSH config with Tailscale hostnames: + +``` +Host homelab-1 + HostName homelab-1.tailnet.ts.net + User admin +``` + +**Tailscale Commands Used**: + +| Command | Purpose | Example | +|---------|---------|---------| +| `tailscale status` | Show network status | Lists all connected machines | +| `tailscale ping` | Check connectivity | `tailscale ping machine-name` | +| `tailscale ssh` | SSH to machine | `tailscale ssh user@machine` | + +## Workflows + +### 1. Host Health Monitoring + +**User Query**: "Which of my machines are online?" + +**Workflow**: + +1. Load SSH config and sshsync groups +2. Execute `sshsync ls --with-status` +3. Parse connectivity results +4. Query Tailscale status for additional context +5. Return formatted health report with: + - Online/offline status per host + - Group memberships + - Tailscale connection state + - Last seen timestamp + +**Implementation**: `scripts/sshsync_wrapper.py` → `get_host_status()` + +**Output Format**: +``` +🟢 homelab-1 (homelab) - Online - Tailscale: Connected +🟢 prod-web-01 (production, web-servers) - Online - Tailscale: Connected +🔴 dev-laptop (development) - Offline - Last seen: 2h ago +🟢 prod-db-01 (production, databases) - Online - Tailscale: Connected + +Summary: 3/4 hosts online (75%) +``` + +### 2. Intelligent Load Balancing + +**User Query**: "Run this task on the least loaded machine" + +**Workflow**: + +1. Get list of candidate hosts (from group or all) +2. For each online host, check: + - CPU load (via `uptime` or `top`) + - Memory usage (via `free` or `vm_stat`) + - Disk space (via `df`) +3. Calculate composite load score +4. Select host with lowest score +5. Execute task on selected host +6. Return result with performance metrics + +**Implementation**: `scripts/load_balancer.py` → `select_optimal_host()` + +**Load Score Calculation**: +``` +score = (cpu_pct * 0.4) + (mem_pct * 0.3) + (disk_pct * 0.3) +``` + +Lower score = better candidate for task execution. + +**Output Format**: +``` +✓ Selected host: prod-web-02 + Reason: Lowest load score (0.32) + - CPU: 15% (vs avg 45%) + - Memory: 30% (vs avg 60%) + - Disk: 40% (vs avg 55%) + +Executing: npm run build +[Task output...] + +✓ Completed in 2m 15s +``` + +### 3. File Synchronization Workflows + +**User Query**: "Sync my code to all development machines" + +**Workflow**: + +1. Validate source path exists locally +2. Identify target group ("development") +3. Check connectivity to all group members +4. Show dry-run preview (files to be synced, sizes) +5. Execute parallel push to all hosts +6. Validate successful transfer on each host +7. Return summary with per-host status + +**Implementation**: `scripts/sshsync_wrapper.py` → `push_to_group()` + +**Supported Operations**: + +- **Push to all**: Sync files to every configured host +- **Push to group**: Sync to specific group (dev, prod, etc.) +- **Pull from host**: Retrieve files from single host +- **Pull from group**: Collect files from multiple hosts +- **Recursive sync**: Entire directory trees with `--recurse` + +**Output Format**: +``` +📤 Syncing: ~/projects/myapp → /var/www/myapp +Group: development (3 hosts) + +Preview (dry-run): + - dev-laptop: 145 files, 12.3 MB + - dev-desktop: 145 files, 12.3 MB + - dev-server: 145 files, 12.3 MB + +Execute? [Proceeding...] + +✓ dev-laptop: Synced 145 files in 8s +✓ dev-desktop: Synced 145 files in 6s +✓ dev-server: Synced 145 files in 10s + +Summary: 3/3 successful (435 files, 36.9 MB total) +``` + +### 4. Remote Command Orchestration + +**User Query**: "Check disk space on all web servers" + +**Workflow**: + +1. Identify target group ("web-servers") +2. Validate group exists and has members +3. Check connectivity to group members +4. Execute command in parallel across group +5. Collect and parse outputs +6. Format results with per-host breakdown + +**Implementation**: `scripts/sshsync_wrapper.py` → `execute_on_group()` + +**Features**: + +- **Parallel execution**: Commands run simultaneously on all hosts +- **Timeout handling**: Configurable per-command timeout (default 10s) +- **Error isolation**: Failure on one host doesn't stop others +- **Output aggregation**: Collect and correlate all outputs +- **Dry-run mode**: Preview what would execute without running + +**Output Format**: +``` +🔧 Executing on group 'web-servers': df -h /var/www + +web-01: + Filesystem: /dev/sda1 + Size: 100G, Used: 45G, Available: 50G (45% used) + +web-02: + Filesystem: /dev/sda1 + Size: 100G, Used: 67G, Available: 28G (67% used) ⚠️ + +web-03: + Filesystem: /dev/sda1 + Size: 100G, Used: 52G, Available: 43G (52% used) + +⚠️ Alert: web-02 is above 60% disk usage +``` + +### 5. Multi-Stage Deployment Workflow + +**User Query**: "Deploy to staging, test, then production" + +**Workflow**: + +1. **Stage 1 - Staging Deploy**: + - Push code to staging group + - Run build process + - Execute automated tests + - If tests fail: STOP and report error + +2. **Stage 2 - Validation**: + - Check staging health endpoints + - Validate database migrations + - Run smoke tests + +3. **Stage 3 - Production Deploy**: + - Push to production group (one at a time for zero-downtime) + - Restart services gracefully + - Verify each host before proceeding to next + +4. **Stage 4 - Verification**: + - Check production health + - Monitor for errors + - Rollback if issues detected + +**Implementation**: `scripts/workflow_executor.py` → `deploy_workflow()` + +**Output Format**: +``` +🚀 Multi-Stage Deployment Workflow + +Stage 1: Staging Deployment + ✓ Pushed code to staging-01 + ✓ Build completed (2m 15s) + ✓ Tests passed (145/145) + +Stage 2: Validation + ✓ Health check passed + ✓ Database migration OK + ✓ Smoke tests passed (12/12) + +Stage 3: Production Deployment + ✓ prod-web-01: Deployed & verified + ✓ prod-web-02: Deployed & verified + ✓ prod-web-03: Deployed & verified + +Stage 4: Verification + ✓ All health checks passed + ✓ No errors in logs (5min window) + +✅ Deployment completed successfully in 12m 45s +``` + +## Available Scripts + +### scripts/sshsync_wrapper.py + +**Purpose**: Python wrapper around sshsync CLI for programmatic access + +**Functions**: + +- `get_host_status(group=None)`: Get online/offline status of hosts +- `execute_on_all(command, timeout=10, dry_run=False)`: Run command on all hosts +- `execute_on_group(group, command, timeout=10, dry_run=False)`: Run on specific group +- `execute_on_host(host, command, timeout=10)`: Run on single host +- `push_to_hosts(local_path, remote_path, hosts=None, group=None, recurse=False, dry_run=False)`: Push files +- `pull_from_host(host, remote_path, local_path, recurse=False, dry_run=False)`: Pull files +- `list_hosts(with_status=True)`: List all configured hosts +- `get_groups()`: Get all defined groups and their members +- `add_hosts_to_group(group, hosts)`: Add hosts to a group + +**Usage Example**: +```python +from sshsync_wrapper import execute_on_group, push_to_hosts + +# Execute command +result = execute_on_group( + group="web-servers", + command="systemctl status nginx", + timeout=15 +) + +# Push files +push_to_hosts( + local_path="./dist", + remote_path="/var/www/app", + group="production", + recurse=True +) +``` + +### scripts/tailscale_manager.py + +**Purpose**: Tailscale-specific operations and status management + +**Functions**: + +- `get_tailscale_status()`: Get Tailscale network status (all peers) +- `check_connectivity(host)`: Ping host via Tailscale +- `get_peer_info(hostname)`: Get detailed info about peer +- `list_online_machines()`: List all online Tailscale machines +- `get_machine_ip(hostname)`: Get Tailscale IP for machine +- `validate_tailscale_ssh(host)`: Check if Tailscale SSH is working + +**Usage Example**: +```python +from tailscale_manager import get_tailscale_status, check_connectivity + +# Get network status +status = get_tailscale_status() +print(f"Online machines: {status['online_count']}") + +# Check specific host +is_online = check_connectivity("homelab-1") +``` + +### scripts/load_balancer.py + +**Purpose**: Intelligent task distribution based on machine resources + +**Functions**: + +- `get_machine_load(host)`: Get CPU, memory, disk metrics +- `calculate_load_score(metrics)`: Calculate composite load score +- `select_optimal_host(candidates, prefer_group=None)`: Pick best host +- `get_group_capacity()`: Get aggregate capacity of group +- `distribute_tasks(tasks, hosts)`: Distribute multiple tasks optimally + +**Usage Example**: +```python +from load_balancer import select_optimal_host + +# Find best machine for task +best_host = select_optimal_host( + candidates=["web-01", "web-02", "web-03"], + prefer_group="production" +) + +# Execute on selected host +execute_on_host(best_host, "npm run build") +``` + +### scripts/workflow_executor.py + +**Purpose**: Common multi-machine workflow automation + +**Functions**: + +- `deploy_workflow(code_path, staging_group, prod_group)`: Full deployment pipeline +- `backup_workflow(hosts, backup_paths, destination)`: Backup from multiple hosts +- `sync_workflow(source_host, target_group, paths)`: Sync from one to many +- `rolling_restart(group, service_name)`: Zero-downtime service restart +- `health_check_workflow(group, endpoint)`: Check health across group + +**Usage Example**: +```python +from workflow_executor import deploy_workflow, backup_workflow + +# Deploy with testing +deploy_workflow( + code_path="./dist", + staging_group="staging", + prod_group="production" +) + +# Backup from all databases +backup_workflow( + hosts=["db-01", "db-02"], + backup_paths=["/var/lib/mysql"], + destination="./backups" +) +``` + +### scripts/utils/helpers.py + +**Purpose**: Common utilities and formatting functions + +**Functions**: + +- `format_bytes(bytes)`: Human-readable byte formatting (1.2 GB) +- `format_duration(seconds)`: Human-readable duration (2m 15s) +- `parse_ssh_config()`: Parse ~/.ssh/config for host details +- `parse_sshsync_config()`: Parse sshsync group configuration +- `get_timestamp()`: Get ISO timestamp for logging +- `safe_execute(func, *args, **kwargs)`: Execute with error handling +- `validate_path(path)`: Check if path exists and is accessible + +### scripts/utils/validators/parameter_validator.py + +**Purpose**: Validate user inputs and parameters + +**Functions**: + +- `validate_host(host, valid_hosts=None)`: Validate host exists +- `validate_group(group, valid_groups=None)`: Validate group exists +- `validate_path_exists(path)`: Check local path exists +- `validate_timeout(timeout)`: Ensure timeout is reasonable +- `validate_command(command)`: Basic command safety validation + +### scripts/utils/validators/host_validator.py + +**Purpose**: Validate host configuration and availability + +**Functions**: + +- `validate_ssh_config(host)`: Check host has SSH config entry +- `validate_host_reachable(host, timeout=5)`: Check host is reachable +- `validate_group_members(group)`: Ensure group has valid members +- `get_invalid_hosts(hosts)`: Find hosts without valid config + +### scripts/utils/validators/connection_validator.py + +**Purpose**: Validate SSH and Tailscale connections + +**Functions**: + +- `validate_ssh_connection(host)`: Test SSH connection works +- `validate_tailscale_connection(host)`: Test Tailscale connectivity +- `validate_ssh_key(host)`: Check SSH key authentication +- `get_connection_diagnostics(host)`: Comprehensive connection testing + +## Available Analyses + +### 1. Host Availability Analysis + +**Function**: `analyze_host_availability(group=None)` + +**Objective**: Determine which machines are online and accessible + +**Inputs**: +- `group` (optional): Specific group to check (None = all hosts) + +**Outputs**: +```python +{ + 'total_hosts': 10, + 'online_hosts': 8, + 'offline_hosts': 2, + 'availability_pct': 80.0, + 'by_group': { + 'production': {'online': 3, 'total': 3, 'pct': 100.0}, + 'development': {'online': 2, 'total': 3, 'pct': 66.7}, + 'homelab': {'online': 3, 'total': 4, 'pct': 75.0} + }, + 'offline_hosts_details': [ + {'host': 'dev-laptop', 'last_seen': '2h ago', 'groups': ['development']}, + {'host': 'homelab-4', 'last_seen': '1d ago', 'groups': ['homelab']} + ] +} +``` + +**Interpretation**: +- **> 90%**: Excellent availability +- **70-90%**: Good availability, monitor offline hosts +- **< 70%**: Poor availability, investigate issues + +### 2. Load Distribution Analysis + +**Function**: `analyze_load_distribution(group=None)` + +**Objective**: Understand resource usage across machines + +**Inputs**: +- `group` (optional): Specific group to analyze + +**Outputs**: +```python +{ + 'hosts': [ + { + 'host': 'web-01', + 'cpu_pct': 45, + 'mem_pct': 60, + 'disk_pct': 40, + 'load_score': 0.49, + 'status': 'moderate' + }, + # ... more hosts + ], + 'aggregate': { + 'avg_cpu': 35, + 'avg_mem': 55, + 'avg_disk': 45, + 'total_capacity': 1200 # GB + }, + 'recommendations': [ + { + 'host': 'web-02', + 'issue': 'High CPU usage (85%)', + 'action': 'Consider migrating workloads' + } + ] +} +``` + +**Load Status**: +- **Low** (score < 0.4): Good capacity for more work +- **Moderate** (0.4-0.7): Normal operation +- **High** (> 0.7): May need to offload work + +### 3. File Sync Status Analysis + +**Function**: `analyze_sync_status(local_path, remote_path, group)` + +**Objective**: Compare local files with remote versions + +**Inputs**: +- `local_path`: Local directory to compare +- `remote_path`: Remote directory path +- `group`: Group to check + +**Outputs**: +```python +{ + 'local_files': 145, + 'local_size': 12582912, # bytes + 'hosts': [ + { + 'host': 'web-01', + 'status': 'in_sync', + 'files_match': 145, + 'files_different': 0, + 'missing_files': 0 + }, + { + 'host': 'web-02', + 'status': 'out_of_sync', + 'files_match': 140, + 'files_different': 3, + 'missing_files': 2, + 'details': ['config.json modified', 'index.html modified', ...] + } + ], + 'sync_percentage': 96.7, + 'recommended_action': 'Push to web-02' +} +``` + +### 4. Network Latency Analysis + +**Function**: `analyze_network_latency(hosts=None)` + +**Objective**: Measure connection latency to hosts + +**Inputs**: +- `hosts` (optional): Specific hosts to test (None = all) + +**Outputs**: +```python +{ + 'hosts': [ + {'host': 'web-01', 'latency_ms': 15, 'status': 'excellent'}, + {'host': 'web-02', 'latency_ms': 45, 'status': 'good'}, + {'host': 'db-01', 'latency_ms': 150, 'status': 'fair'} + ], + 'avg_latency': 70, + 'min_latency': 15, + 'max_latency': 150, + 'recommendations': [ + {'host': 'db-01', 'issue': 'High latency', 'action': 'Check network path'} + ] +} +``` + +**Latency Classification**: +- **Excellent** (< 50ms): Ideal for interactive tasks +- **Good** (50-100ms): Suitable for most operations +- **Fair** (100-200ms): May impact interactive workflows +- **Poor** (> 200ms): Investigate network issues + +### 5. Comprehensive Infrastructure Report + +**Function**: `comprehensive_infrastructure_report(group=None)` + +**Objective**: One-stop function for complete infrastructure overview + +**Inputs**: +- `group` (optional): Limit to specific group (None = all) + +**Outputs**: +```python +{ + 'report_timestamp': '2025-10-19T19:43:41Z', + 'group': 'production', # or 'all' + 'metrics': { + 'availability': {...}, # from analyze_host_availability + 'load_distribution': {...}, # from analyze_load_distribution + 'network_latency': {...}, # from analyze_network_latency + 'tailscale_status': {...} # from Tailscale integration + }, + 'summary': "Production infrastructure: 3/3 hosts online, avg load 45%, network latency 35ms", + 'alerts': [ + "⚠ web-02: High CPU usage (85%)", + "⚠ db-01: Elevated latency (150ms)" + ], + 'recommendations': [ + "Consider rebalancing workload from web-02", + "Investigate network path to db-01" + ], + 'overall_health': 'good' # excellent | good | fair | poor +} +``` + +**Overall Health Classification**: +- **Excellent**: All metrics green, no alerts +- **Good**: Most metrics healthy, minor alerts +- **Fair**: Some concerning metrics, action recommended +- **Poor**: Critical issues, immediate action required + +## Error Handling + +### Connection Errors + +**Error**: Cannot connect to host + +**Causes**: +- Host is offline +- Tailscale not connected +- SSH key missing/invalid +- Firewall blocking connection + +**Handling**: +```python +try: + execute_on_host("web-01", "ls") +except ConnectionError as e: + # Try Tailscale ping first + if not check_connectivity("web-01"): + return { + 'error': 'Host unreachable', + 'suggestion': 'Check Tailscale connection', + 'diagnostics': get_connection_diagnostics("web-01") + } + # Then check SSH + if not validate_ssh_connection("web-01"): + return { + 'error': 'SSH authentication failed', + 'suggestion': 'Check SSH keys: ssh-add -l' + } +``` + +### Timeout Errors + +**Error**: Operation timed out + +**Causes**: +- Command taking too long +- Network latency +- Host overloaded + +**Handling**: +- Automatic retry with exponential backoff (3 attempts) +- Increase timeout for known slow operations +- Fall back to alternative host if available + +### File Transfer Errors + +**Error**: File sync failed + +**Causes**: +- Insufficient disk space +- Permission denied +- Path doesn't exist + +**Handling**: +- Pre-check disk space on target +- Validate permissions before transfer +- Create directories if needed +- Partial transfer recovery + +### Validation Errors + +**Error**: Invalid parameter + +**Examples**: +- Unknown host +- Non-existent group +- Invalid path + +**Handling**: +- Validate all inputs before execution +- Provide suggestions for similar valid options +- Clear error messages with corrective actions + +## Mandatory Validations + +### Before Any Operation + +1. **Parameter Validation**: + ```python + host = validate_host(host, valid_hosts=get_all_hosts()) + group = validate_group(group, valid_groups=get_groups()) + timeout = validate_timeout(timeout) + ``` + +2. **Connection Validation**: + ```python + if not validate_host_reachable(host, timeout=5): + raise ConnectionError(f"Host {host} is not reachable") + ``` + +3. **Path Validation** (for file operations): + ```python + if not validate_path_exists(local_path): + raise ValueError(f"Path does not exist: {local_path}") + ``` + +### During Operation + +1. **Timeout Monitoring**: Every operation has configurable timeout +2. **Progress Tracking**: Long operations show progress +3. **Error Isolation**: Failure on one host doesn't stop others + +### After Operation + +1. **Result Validation**: + ```python + report = validate_operation_result(result) + if report.has_critical_issues(): + raise OperationError(report.get_summary()) + ``` + +2. **State Verification**: Confirm operation succeeded +3. **Logging**: Record all operations for audit trail + +## Performance and Caching + +### Caching Strategy + +**Host Status Cache**: +- **TTL**: 60 seconds +- **Why**: Host status doesn't change rapidly +- **Invalidation**: Manual invalidate when connectivity changes + +**Load Metrics Cache**: +- **TTL**: 30 seconds +- **Why**: Load changes frequently +- **Invalidation**: Automatic on timeout + +**Group Configuration Cache**: +- **TTL**: 5 minutes +- **Why**: Group membership rarely changes +- **Invalidation**: Manual when groups modified + +### Performance Optimizations + +1. **Parallel Execution**: + - Commands execute concurrently across hosts + - ThreadPoolExecutor with max 10 workers + - Prevents sequential bottleneck + +2. **Connection Pooling**: + - Reuse SSH connections when possible + - ControlMaster in SSH config + +3. **Lazy Loading**: + - Only fetch data when needed + - Don't load all host status unless required + +4. **Progressive Results**: + - Stream results as they complete + - Don't wait for slowest host + +## Automatic Detection Keywords + +This skill automatically activates when you mention: + +**Hosts & Machines**: +- remote machine, remote host, remote server +- Tailscale machine, Tailscale host +- SSH host, SSH server +- homelab, servers, cluster + +**Tools**: +- sshsync, ssh sync +- Tailscale, tailnet +- SSH, secure shell + +**Operations**: +- distribute workload, load balance +- file sharing, file sync, file transfer +- push files, pull files, sync files +- remote execution, execute remotely +- deploy to, deploy across +- backup from, sync to + +**Groups**: +- host group, machine group, server group +- production servers, development machines +- web servers, database servers +- staging environment, production environment + +**Actions**: +- check status, check connectivity +- run on all, execute on group +- sync across, distribute to +- deploy to, push to, pull from + +**Monitoring**: +- which machines are online +- host availability, server status +- machine health, connectivity check +- network status, latency check + +**Workflows**: +- multi-stage deployment +- rolling restart, zero-downtime +- backup workflow, sync workflow +- distributed task, parallel execution + +## Usage Examples + +### Example 1: Check Host Status + +**User**: "Which of my Tailscale machines are online?" + +**Agent Response**: +``` +Checking Tailscale network status... + +🟢 homelab-1 (homelab) - Online - Tailscale: Connected + IP: 100.64.1.10 | Latency: 15ms + +🟢 homelab-2 (homelab) - Online - Tailscale: Connected + IP: 100.64.1.11 | Latency: 18ms + +🔴 dev-laptop (development) - Offline + Last seen: 2 hours ago | Tailscale: Not connected + +🟢 prod-web-01 (production, web-servers) - Online - Tailscale: Connected + IP: 100.64.1.20 | Latency: 25ms + +🟢 prod-db-01 (production, databases) - Online - Tailscale: Connected + IP: 100.64.1.30 | Latency: 30ms + +Summary: 4/5 hosts online (80%) +Average latency: 22ms +``` + +### Example 2: Execute on Group + +**User**: "Check disk space on all web servers" + +**Agent Response**: +``` +Executing on group 'web-servers': df -h /var/www + +🔧 web-01: + Filesystem: /dev/sda1 + Size: 100G | Used: 45G | Available: 50G (45% used) + Status: ✓ Healthy + +🔧 web-02: + Filesystem: /dev/sda1 + Size: 100G | Used: 67G | Available: 28G (67% used) + Status: ⚠ Warning - Above 60% + +🔧 web-03: + Filesystem: /dev/sda1 + Size: 100G | Used: 52G | Available: 43G (52% used) + Status: ✓ Healthy + +⚠️ Alert: web-02 is above 60% disk usage +Recommendation: Clear old logs or increase disk space +``` + +### Example 3: Load-Balanced Execution + +**User**: "Run this npm build on the machine with most available resources" + +**Agent Response**: +``` +Analyzing resource availability across 5 machines... + +Load Analysis: + web-01: CPU 45%, Mem 60%, Disk 40% → Score: 0.49 (moderate) + web-02: CPU 85%, Mem 70%, Disk 65% → Score: 0.75 (high load) + web-03: CPU 20%, Mem 35%, Disk 30% → Score: 0.28 (low load) ✓ + +✓ Selected: web-03 + Reason: Lowest load score (0.28) + Available: CPU 80%, Memory 65%, Disk 70% + +Executing on web-03: npm run build + +[Build output...] + ✓ Build completed successfully + Duration: 2m 15s + Output: ./dist (145 files, 12.3 MB) + +Final load: CPU 65%, Mem 50% (still below average) +``` + +### Example 4: File Sync + +**User**: "Push my app directory to all production servers" + +**Agent Response**: +``` +📤 Syncing: ~/projects/myapp → /var/www/myapp +Target group: production (3 hosts) + +Analyzing source... + 145 files, 12.3 MB total + +Preview (dry-run): + prod-web-01: 145 files → /var/www/myapp + prod-web-02: 145 files → /var/www/myapp + prod-db-01: Skipped (not in web-servers group) + +Checking target capacity... + ✓ prod-web-01: 50 GB available + ✓ prod-web-02: 28 GB available + +Executing push... + +✓ prod-web-01: Synced 145 files in 8.2s + Modified: 12 files + Added: 3 files + +✓ prod-web-02: Synced 145 files in 6.5s + Modified: 12 files + Added: 3 files + +Summary: 2/2 successful (290 files, 24.6 MB total) +``` + +### Example 5: Multi-Stage Deployment + +**User**: "Deploy this to staging, run tests, then push to production" + +**Agent Response**: +``` +🚀 Multi-Stage Deployment Workflow + +Stage 1: Staging Deployment + 📤 Pushing to staging-01... + ✓ Code synced (145 files, 12.3 MB) - 8s + + 🔨 Running build... + $ npm run build + ✓ Build completed - 2m 15s + + 🧪 Running tests... + $ npm test + ✓ All tests passed (145/145) - 45s + +Stage 2: Staging Validation + ✓ Health endpoint: 200 OK + ✓ Database migrations: Up to date + ✓ Smoke tests: 12/12 passed + +Stage 3: Production Deployment + Rolling deployment to 3 hosts... + + ⏳ prod-web-01: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 10s + + ⏳ prod-web-02: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 5s + + ⏳ prod-web-03: + ✓ Code synced + ✓ Build completed + ✓ Service restarted gracefully + ✓ Health check passed + Duration: 3m 12s + +Stage 4: Production Verification + ✓ All health checks passed (3/3) + ✓ No errors in logs (5min window) + ✓ Response times normal (avg 45ms) + +✅ Deployment completed successfully! + Total duration: 12m 45s + Hosts updated: 3 + Zero downtime: ✓ +``` + +### Example 6: Comprehensive Infrastructure Report + +**User**: "Give me a complete status report of my infrastructure" + +**Agent Response**: +``` +📊 Infrastructure Report +Generated: 2025-10-19 19:43:41 + +=== AVAILABILITY === +Total Hosts: 10 +Online: 8 (80%) +Offline: 2 + +By Group: + production: 3/3 (100%) ✓ + development: 2/3 (66.7%) ⚠ + homelab: 3/4 (75%) + +Offline Hosts: + • dev-laptop (development) - Last seen: 2h ago + • homelab-4 (homelab) - Last seen: 1d ago + +=== LOAD DISTRIBUTION === +Average Load: 45% CPU, 55% Memory, 45% Disk + +Top 3 Loaded Hosts: + 1. web-02: 85% CPU, 70% Mem, 65% Disk (Score: 0.75) ⚠ + 2. db-01: 60% CPU, 75% Mem, 55% Disk (Score: 0.65) + 3. web-01: 45% CPU, 60% Mem, 40% Disk (Score: 0.49) + +Top 3 Available Hosts: + 1. web-03: 20% CPU, 35% Mem, 30% Disk (Score: 0.28) ✓ + 2. homelab-1: 25% CPU, 40% Mem, 35% Disk (Score: 0.33) + 3. homelab-2: 30% CPU, 45% Mem, 40% Disk (Score: 0.38) + +=== NETWORK LATENCY === +Average: 35ms +Range: 15ms - 150ms + +Excellent (< 50ms): 6 hosts +Good (50-100ms): 1 host +Fair (100-200ms): 1 host (db-01: 150ms) ⚠ + +=== TAILSCALE STATUS === +Network: Connected +Peers Online: 8/10 +Exit Node: None +MagicDNS: Enabled + +=== ALERTS === +⚠ web-02: High CPU usage (85%) - Consider load balancing +⚠ db-01: Elevated latency (150ms) - Check network path +⚠ dev-laptop: Offline for 2 hours - May need attention + +=== RECOMMENDATIONS === +1. Rebalance workload from web-02 to web-03 +2. Investigate network latency to db-01 +3. Check status of dev-laptop and homelab-4 +4. Consider scheduling maintenance for web-02 + +Overall Health: GOOD ✓ +``` + +## Installation + +See INSTALLATION.md for detailed setup instructions. + +Quick start: +```bash +# 1. Install sshsync +pip install sshsync + +# 2. Configure SSH hosts +vim ~/.ssh/config + +# 3. Sync host groups +sshsync sync + +# 4. Install agent +/plugin marketplace add ./tailscale-sshsync-agent + +# 5. Test +"Which of my machines are online?" +``` + +## Version + +Current version: 1.0.0 + +See CHANGELOG.md for release history. + +## Architecture Decisions + +See DECISIONS.md for detailed rationale behind tool selection, architecture choices, and trade-offs considered. diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..3eefcb9 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0.0 diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..6ab8644 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,117 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:Human-Frontier-Labs-Inc/human-frontier-labs-marketplace:plugins/tailscale-sshsync-agent", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "3a7cbe9632f245c6b9a4c4bf2731da65c857a7f4", + "treeHash": "832bc62ce02c782663e60a2eb97932166fef39c681a9ca01b9d5dc170860b805", + "generatedAt": "2025-11-28T10:11:41.356928Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "tailscale-sshsync-agent", + "description": "Manages distributed workloads and file sharing across Tailscale SSH-connected machines. Automates remote command execution, intelligent load balancing, file synchronization workflows, host health monitoring, and multi-machine orchestration using sshsync.", + "version": null + }, + "content": { + "files": [ + { + "path": "CHANGELOG.md", + "sha256": "74dbda933868b7cab410144a831b43e4f1ae6161f2402edcb068a8232c50bfe4" + }, + { + "path": "README.md", + "sha256": "470f165d8ac61a8942e6fb3568c49febb7f803bfa0f4010d14e09f807c34c88e" + }, + { + "path": "VERSION", + "sha256": "59854984853104df5c353e2f681a15fc7924742f9a2e468c29af248dce45ce03" + }, + { + "path": "SKILL.md", + "sha256": "31c8f237f9b3617c32c6ff381ae83d427b50eb0877d3763d9826e00ece6618f1" + }, + { + "path": "INSTALLATION.md", + "sha256": "9313ea1bbb0a03e4c078c41b207f3febe800cd38eb57b7205c7b5188238ca46a" + }, + { + "path": "DECISIONS.md", + "sha256": "59549e84aaa8e32d4bdf64d46855714f5cde7f061906e1c74976658883472c82" + }, + { + "path": "references/tailscale-integration.md", + "sha256": "6553b3ceeaca5118a7b005368223ea4b3ab70eb2492ccaf5c2b7f7758b65dd42" + }, + { + "path": "references/sshsync-guide.md", + "sha256": "697ce0b56eda258732a0b924f821e9e24eb6b977934153bdd2045be961e58de2" + }, + { + "path": "tests/test_validation.py", + "sha256": "716ae0d2e86f0e6657903aef6bb714fbd3b5b72d3b109fab4da3f75f90cc2c0a" + }, + { + "path": "tests/test_helpers.py", + "sha256": "3be88e30825414eb3ade048b766c84995dc98a01cb7236ce75201716179279a8" + }, + { + "path": "tests/test_integration.py", + "sha256": "12f7cb857fda23531a9c74caf072cf73b739672b1e99c55f42a2ef8e11238523" + }, + { + "path": "scripts/load_balancer.py", + "sha256": "9d87476562ac848a026e42116e381f733d520e9330da33de3d905585af14398d" + }, + { + "path": "scripts/tailscale_manager.py", + "sha256": "4b75ebb9423d221b9788eb9352b274e0256c101185de11064a7b4cb00684016e" + }, + { + "path": "scripts/workflow_executor.py", + "sha256": "9f23f3bb421e940766e65949e6efa485a313115e297d4c5f1088589155a7bac1" + }, + { + "path": "scripts/sshsync_wrapper.py", + "sha256": "fc2062ebbc72e3ddc6c6bfb5f22019b23050f5c2ed9ac35c315018a96871fb19" + }, + { + "path": "scripts/utils/helpers.py", + "sha256": "b01979ee56ab92037b8f8054a883124d600b8337cf461855092b866091aed24a" + }, + { + "path": "scripts/utils/validators/connection_validator.py", + "sha256": "9ac82108e69690b74d9aa89ca51f7d06fe860e880aaa1983d08242d7199d1601" + }, + { + "path": "scripts/utils/validators/parameter_validator.py", + "sha256": "157dfcb7f1937df88344647a37a124d52e1de1b992b72c9b9e69d3b717ca0195" + }, + { + "path": "scripts/utils/validators/__init__.py", + "sha256": "2d109ad1b5d253578a095c8354159fdf9318154b4f62d9b16eaa1a88a422382d" + }, + { + "path": "scripts/utils/validators/host_validator.py", + "sha256": "79cab42587435a799349ba8a562c4ec0f3d54f3f2790562c894c6289beade6d6" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "0ec7466bbf2e8dc2fe1607feff0cc0ef0ebebf44ff54f17dcce96255e2c21215" + } + ], + "dirSha256": "832bc62ce02c782663e60a2eb97932166fef39c681a9ca01b9d5dc170860b805" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/sshsync-guide.md b/references/sshsync-guide.md new file mode 100644 index 0000000..55fb541 --- /dev/null +++ b/references/sshsync-guide.md @@ -0,0 +1,466 @@ +# sshsync CLI Tool Guide + +Complete reference for using sshsync with Tailscale SSH Sync Agent. + +## Table of Contents + +1. [Installation](#installation) +2. [Configuration](#configuration) +3. [Core Commands](#core-commands) +4. [Advanced Usage](#advanced-usage) +5. [Troubleshooting](#troubleshooting) + +## Installation + +### Via pip + +```bash +pip install sshsync +``` + +### Verify Installation + +```bash +sshsync --version +``` + +## Configuration + +### 1. SSH Config Setup + +sshsync uses your existing SSH configuration. Edit `~/.ssh/config`: + +``` +# Example host entries +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + Port 22 + +Host prod-web-01 + HostName 100.64.1.20 + User deploy + IdentityFile ~/.ssh/id_rsa + Port 22 + +Host dev-laptop + HostName 100.64.1.30 + User developer +``` + +**Important Notes**: +- sshsync uses the **Host alias** (e.g., "homelab-1"), not the actual hostname +- Ensure SSH key authentication is configured +- Test each host with `ssh host-alias` before using with sshsync + +### 2. Initialize sshsync Configuration + +First run: + +```bash +sshsync sync +``` + +This will: +1. Read all hosts from your SSH config +2. Prompt you to assign hosts to groups +3. Create `~/.config/sshsync/config.yaml` + +### 3. sshsync Config File + +Location: `~/.config/sshsync/config.yaml` + +Structure: +```yaml +groups: + production: + - prod-web-01 + - prod-web-02 + - prod-db-01 + development: + - dev-laptop + - dev-desktop + homelab: + - homelab-1 + - homelab-2 +``` + +**Manual Editing**: +- Groups are arbitrary labels (use what makes sense for you) +- Hosts can belong to multiple groups +- Use consistent host aliases from SSH config + +## Core Commands + +### List Hosts + +```bash +# List all configured hosts +sshsync ls + +# List with online/offline status +sshsync ls --with-status +``` + +**Output Example**: +``` +Host Status +homelab-1 online +homelab-2 offline +prod-web-01 online +dev-laptop online +``` + +### Execute Commands + +#### On All Hosts + +```bash +# Execute on all configured hosts +sshsync all "df -h" + +# With custom timeout (default: 10s) +sshsync all --timeout 20 "systemctl status nginx" + +# Dry-run (preview without executing) +sshsync all --dry-run "reboot" +``` + +#### On Specific Group + +```bash +# Execute on group +sshsync group production "uptime" + +# With timeout +sshsync group web-servers --timeout 30 "npm run build" + +# Filter with regex +sshsync group production --regex "web-.*" "df -h" +``` + +**Regex Filtering**: +- Filters group members by alias matching pattern +- Uses Python regex syntax +- Example: `--regex "web-0[1-3]"` matches web-01, web-02, web-03 + +### File Transfer + +#### Push Files + +```bash +# Push to specific host +sshsync push --host web-01 ./app /var/www/app + +# Push to group +sshsync push --group production ./dist /var/www/app + +# Push to all hosts +sshsync push --all ./config.yml /etc/app/config.yml + +# Recursive push (directory with contents) +sshsync push --group web --recurse ./app /var/www/app + +# Dry-run +sshsync push --group production --dry-run ./dist /var/www/app +``` + +**Important**: +- Local path comes first, remote path second +- Use `--recurse` for directories +- Dry-run shows what would be transferred without executing + +#### Pull Files + +```bash +# Pull from specific host +sshsync pull --host db-01 /var/log/mysql/error.log ./logs/ + +# Pull from group (creates separate directories per host) +sshsync pull --group databases /var/backups ./backups/ + +# Recursive pull +sshsync pull --host web-01 --recurse /var/www/app ./backup/ +``` + +**Pull Behavior**: +- When pulling from groups, creates subdirectory per host +- Use `--recurse` to pull entire directory trees +- Destination directory created if doesn't exist + +### Group Management + +#### Add Hosts to Group + +```bash +# Interactive: prompts to select hosts +sshsync gadd production + +# Follow prompts to select which hosts to add +``` + +#### Add Host to SSH Config + +```bash +# Interactive host addition +sshsync hadd + +# Follow prompts for: +# - Host alias +# - Hostname/IP +# - Username +# - Port (optional) +# - Identity file (optional) +``` + +#### Sync Ungrouped Hosts + +```bash +# Assign groups to hosts not yet in any group +sshsync sync +``` + +## Advanced Usage + +### Parallel Execution + +sshsync automatically executes commands in parallel across hosts: + +```bash +# This runs simultaneously on all hosts in group +sshsync group web-servers "npm run build" +``` + +**Performance**: +- Commands execute concurrently +- Results collected as they complete +- Timeout applies per-host independently + +### Timeout Strategies + +Different operations need different timeouts: + +```bash +# Quick checks (5-10s) +sshsync all --timeout 5 "hostname" + +# Moderate operations (30-60s) +sshsync group web --timeout 60 "npm install" + +# Long-running tasks (300s+) +sshsync group build --timeout 300 "docker build ." +``` + +**Timeout Best Practices**: +- Set timeout 20-30% longer than expected duration +- Use dry-run first to estimate timing +- Increase timeout for network-intensive operations + +### Combining with Other Tools + +#### With xargs + +```bash +# Get list of online hosts +sshsync ls --with-status | grep online | awk '{print $1}' | xargs -I {} echo "Host {} is online" +``` + +#### With jq (if using JSON output) + +```bash +# Parse structured output (if sshsync supports --json flag) +sshsync ls --json | jq '.hosts[] | select(.status=="online") | .name' +``` + +#### In Shell Scripts + +```bash +#!/bin/bash + +# Deploy script using sshsync +echo "Deploying to staging..." +sshsync push --group staging --recurse ./dist /var/www/app + +if [ $? -eq 0 ]; then + echo "Staging deployment successful" + + echo "Running tests..." + sshsync group staging "cd /var/www/app && npm test" + + if [ $? -eq 0 ]; then + echo "Tests passed, deploying to production..." + sshsync push --group production --recurse ./dist /var/www/app + fi +fi +``` + +## Troubleshooting + +### Common Issues + +#### 1. "Permission denied (publickey)" + +**Cause**: SSH key not configured or not added to ssh-agent + +**Solution**: +```bash +# Add SSH key to agent +ssh-add ~/.ssh/id_ed25519 + +# Verify it's added +ssh-add -l + +# Copy public key to remote +ssh-copy-id user@host +``` + +#### 2. "Connection timed out" + +**Cause**: Host is offline or network issue + +**Solution**: +```bash +# Test connectivity +ping hostname + +# Test Tailscale specifically +tailscale ping hostname + +# Check Tailscale status +tailscale status +``` + +#### 3. "Host not found in SSH config" + +**Cause**: Host alias not in `~/.ssh/config` + +**Solution**: +```bash +# Add host to SSH config +sshsync hadd + +# Or manually edit ~/.ssh/config +vim ~/.ssh/config +``` + +#### 4. "Group not found" + +**Cause**: Group doesn't exist in sshsync config + +**Solution**: +```bash +# Add hosts to new group +sshsync gadd mygroup + +# Or manually edit config +vim ~/.config/sshsync/config.yaml +``` + +#### 5. File Transfer Fails + +**Cause**: Insufficient permissions, disk space, or path doesn't exist + +**Solution**: +```bash +# Check remote disk space +sshsync group production "df -h" + +# Check remote path exists +sshsync group production "ls -ld /target/path" + +# Check permissions +sshsync group production "ls -la /target/path" +``` + +### Debug Mode + +While sshsync doesn't have a built-in verbose mode, you can debug underlying SSH: + +```bash +# Increase SSH verbosity +SSH_VERBOSE=1 sshsync all "uptime" + +# Or use dry-run to see what would execute +sshsync all --dry-run "command" +``` + +### Performance Issues + +If operations are slow: + +1. **Reduce parallelism** (run on fewer hosts at once) +2. **Increase timeout** for network-bound operations +3. **Check network latency**: + ```bash + sshsync all "echo $HOSTNAME" --timeout 5 + ``` + +### Configuration Validation + +```bash +# Verify SSH config is readable +cat ~/.ssh/config + +# Verify sshsync config +cat ~/.config/sshsync/config.yaml + +# Test hosts individually +for host in $(sshsync ls | awk '{print $1}'); do + echo "Testing $host..." + ssh $host "echo OK" || echo "FAILED: $host" +done +``` + +## Best Practices + +1. **Use meaningful host aliases** in SSH config +2. **Organize groups logically** (by function, environment, location) +3. **Always dry-run first** for destructive operations +4. **Set appropriate timeouts** based on operation type +5. **Test SSH keys** before using sshsync +6. **Keep groups updated** as infrastructure changes +7. **Use --with-status** to check availability before operations + +## Integration with Tailscale + +sshsync works seamlessly with Tailscale SSH: + +```bash +# SSH config using Tailscale hostname +Host homelab-1 + HostName homelab-1.tailnet.ts.net + User admin + +# Or using Tailscale IP directly +Host homelab-1 + HostName 100.64.1.10 + User admin +``` + +**Tailscale Advantages**: +- No need for port forwarding +- Encrypted connections +- MagicDNS for easy hostnames +- Works across NATs + +**Verify Tailscale**: +```bash +# Check Tailscale network +tailscale status + +# Ping host via Tailscale +tailscale ping homelab-1 +``` + +## Summary + +sshsync simplifies multi-host SSH operations: +- ✅ Execute commands across host groups +- ✅ Transfer files to/from multiple hosts +- ✅ Organize hosts into logical groups +- ✅ Parallel execution for speed +- ✅ Dry-run mode for safety +- ✅ Works great with Tailscale + +For more help: `sshsync --help` diff --git a/references/tailscale-integration.md b/references/tailscale-integration.md new file mode 100644 index 0000000..b4301e5 --- /dev/null +++ b/references/tailscale-integration.md @@ -0,0 +1,468 @@ +# Tailscale Integration Guide + +How to use Tailscale SSH with sshsync for secure, zero-config remote access. + +## What is Tailscale? + +Tailscale is a zero-config VPN that creates a secure network between your devices using WireGuard. It provides: + +- **Peer-to-peer encrypted connections** +- **No port forwarding required** +- **Works across NATs and firewalls** +- **MagicDNS for easy device addressing** +- **Built-in SSH functionality** +- **Access control lists (ACLs)** + +## Why Tailscale + sshsync? + +Combining Tailscale with sshsync gives you: + +1. **Secure connections** everywhere (Tailscale encryption) +2. **Simple addressing** (MagicDNS hostnames) +3. **Multi-host operations** (sshsync groups and execution) +4. **No firewall configuration** needed +5. **Works from anywhere** (coffee shop, home, office) + +## Setup + +### 1. Install Tailscale + +**macOS**: +```bash +brew install tailscale +``` + +**Linux**: +```bash +curl -fsSL https://tailscale.com/install.sh | sh +``` + +**Verify Installation**: +```bash +tailscale version +``` + +### 2. Connect to Tailscale + +```bash +# Start Tailscale +sudo tailscale up + +# Follow the authentication link +# This opens browser to authenticate + +# Verify connection +tailscale status +``` + +### 3. Configure SSH via Tailscale + +Tailscale provides two SSH options: + +#### Option A: Tailscale SSH (Built-in) + +**Enable on each machine**: +```bash +sudo tailscale up --ssh +``` + +**Use**: +```bash +tailscale ssh user@machine-name +``` + +**Advantages**: +- No SSH server configuration needed +- Uses Tailscale authentication +- Automatic key management + +#### Option B: Standard SSH over Tailscale (Recommended for sshsync) + +**Configure SSH config** to use Tailscale hostnames: + +```bash +# ~/.ssh/config + +Host homelab-1 + HostName homelab-1.tailnet-name.ts.net + User admin + IdentityFile ~/.ssh/id_ed25519 + +# Or use Tailscale IP directly +Host homelab-2 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 +``` + +**Advantages**: +- Works with all SSH tools (including sshsync) +- Standard SSH key authentication +- More flexibility + +## Getting Tailscale Hostnames and IPs + +### View All Machines + +```bash +tailscale status +``` + +**Output**: +``` +100.64.1.10 homelab-1 user@ linux - +100.64.1.11 homelab-2 user@ linux - +100.64.1.20 laptop user@ macOS - +100.64.1.30 phone user@ iOS offline +``` + +### Get MagicDNS Hostname + +**Format**: `machine-name.tailnet-name.ts.net` + +**Find your tailnet name**: +```bash +tailscale status --json | grep -i tailnet +``` + +Or check in Tailscale admin console: https://login.tailscale.com/admin/machines + +### Get Tailscale IP + +```bash +# Your own IP +tailscale ip -4 + +# Another machine's IP (from status output) +tailscale status | grep machine-name +``` + +## Testing Connectivity + +### Ping via Tailscale + +```bash +# Ping by hostname +tailscale ping homelab-1 + +# Ping by IP +tailscale ping 100.64.1.10 +``` + +**Successful output**: +``` +pong from homelab-1 (100.64.1.10) via DERP(nyc) in 45ms +pong from homelab-1 (100.64.1.10) via DERP(nyc) in 43ms +``` + +**Failed output**: +``` +timeout waiting for pong +``` + +### SSH Test + +```bash +# Test SSH connection +ssh user@homelab-1.tailnet.ts.net + +# Or with IP +ssh user@100.64.1.10 +``` + +## Configuring sshsync with Tailscale + +### Step 1: Add Tailscale Hosts to SSH Config + +```bash +vim ~/.ssh/config +``` + +**Example configuration**: +``` +# Production servers +Host prod-web-01 + HostName prod-web-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-web-02 + HostName prod-web-02.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +Host prod-db-01 + HostName prod-db-01.tailnet.ts.net + User deploy + IdentityFile ~/.ssh/id_ed25519 + +# Homelab +Host homelab-1 + HostName 100.64.1.10 + User admin + IdentityFile ~/.ssh/id_ed25519 + +Host homelab-2 + HostName 100.64.1.11 + User admin + IdentityFile ~/.ssh/id_ed25519 + +# Development +Host dev-laptop + HostName dev-laptop.tailnet.ts.net + User developer + IdentityFile ~/.ssh/id_ed25519 +``` + +### Step 2: Test Each Host + +```bash +# Test connectivity to each host +ssh prod-web-01 "hostname" +ssh homelab-1 "hostname" +ssh dev-laptop "hostname" +``` + +### Step 3: Initialize sshsync + +```bash +# Sync hosts and create groups +sshsync sync + +# Add hosts to groups +sshsync gadd production +# Select: prod-web-01, prod-web-02, prod-db-01 + +sshsync gadd homelab +# Select: homelab-1, homelab-2 + +sshsync gadd development +# Select: dev-laptop +``` + +### Step 4: Verify Configuration + +```bash +# List all hosts with status +sshsync ls --with-status + +# Test command execution +sshsync all "uptime" + +# Test group execution +sshsync group production "df -h" +``` + +## Advanced Tailscale Features + +### Tailnet Lock + +Prevents unauthorized device additions: + +```bash +tailscale lock status +``` + +### Exit Nodes + +Route all traffic through a specific machine: + +```bash +# Enable exit node on a machine +sudo tailscale up --advertise-exit-node + +# Use exit node from another machine +sudo tailscale set --exit-node=exit-node-name +``` + +### Subnet Routing + +Access networks behind Tailscale machines: + +```bash +# Advertise subnet routes +sudo tailscale up --advertise-routes=192.168.1.0/24 +``` + +### ACLs (Access Control Lists) + +Control who can access what: https://login.tailscale.com/admin/acls + +**Example ACL**: +```json +{ + "acls": [ + { + "action": "accept", + "src": ["group:admins"], + "dst": ["*:22", "*:80", "*:443"] + }, + { + "action": "accept", + "src": ["group:developers"], + "dst": ["tag:development:*"] + } + ] +} +``` + +## Troubleshooting + +### Machine Shows Offline + +**Check Tailscale status**: +```bash +tailscale status +``` + +**Restart Tailscale**: +```bash +# macOS +brew services restart tailscale + +# Linux +sudo systemctl restart tailscaled +``` + +**Re-authenticate**: +```bash +sudo tailscale up +``` + +### Cannot Connect via SSH + +1. **Verify Tailscale connectivity**: + ```bash + tailscale ping machine-name + ``` + +2. **Check SSH is running** on remote: + ```bash + tailscale ssh machine-name "systemctl status sshd" + ``` + +3. **Verify SSH keys**: + ```bash + ssh-add -l + ``` + +4. **Test SSH directly**: + ```bash + ssh -v user@machine-name.tailnet.ts.net + ``` + +### High Latency + +**Check connection method**: +```bash +tailscale status +``` + +Look for "direct" vs "DERP relay": +- **Direct**: Low latency (< 50ms) +- **DERP relay**: Higher latency (100-200ms) + +**Force direct connection**: +```bash +# Ensure both machines can establish P2P +# May require NAT traversal +``` + +### MagicDNS Not Working + +**Enable MagicDNS**: +1. Go to https://login.tailscale.com/admin/dns +2. Enable MagicDNS + +**Verify**: +```bash +nslookup machine-name.tailnet.ts.net +``` + +## Security Best Practices + +1. **Use SSH keys**, not passwords +2. **Enable Tailnet Lock** to prevent unauthorized devices +3. **Use ACLs** to restrict access +4. **Regularly review** connected devices +5. **Set up key expiry** for team members who leave +6. **Use tags** for machine roles +7. **Enable two-factor auth** for Tailscale account + +## Monitoring + +### Check Network Status + +```bash +# All machines +tailscale status + +# Self status +tailscale status --self + +# JSON format for parsing +tailscale status --json +``` + +### View Logs + +```bash +# macOS +tail -f /var/log/tailscaled.log + +# Linux +journalctl -u tailscaled -f +``` + +## Use Cases with sshsync + +### 1. Deploy to All Production Servers + +```bash +sshsync push --group production --recurse ./dist /var/www/app +sshsync group production "cd /var/www/app && pm2 restart all" +``` + +### 2. Collect Logs from All Servers + +```bash +sshsync pull --group production /var/log/app/error.log ./logs/ +``` + +### 3. Update All Homelab Machines + +```bash +sshsync group homelab "sudo apt update && sudo apt upgrade -y" +``` + +### 4. Check Disk Space Everywhere + +```bash +sshsync all "df -h /" +``` + +### 5. Sync Configuration Across Machines + +```bash +sshsync push --all ~/dotfiles/.bashrc ~/.bashrc +sshsync push --all ~/dotfiles/.vimrc ~/.vimrc +``` + +## Summary + +Tailscale + sshsync = **Powerful Remote Management** + +- ✅ Secure connections everywhere (WireGuard encryption) +- ✅ No firewall configuration needed +- ✅ Easy addressing (MagicDNS) +- ✅ Multi-host operations (sshsync groups) +- ✅ Works from anywhere + +**Quick Start**: +1. Install Tailscale: `brew install tailscale` +2. Connect: `sudo tailscale up` +3. Configure SSH config with Tailscale hostnames +4. Initialize sshsync: `sshsync sync` +5. Start managing: `sshsync all "uptime"` + +For more: https://tailscale.com/kb/ diff --git a/scripts/load_balancer.py b/scripts/load_balancer.py new file mode 100644 index 0000000..9b162a1 --- /dev/null +++ b/scripts/load_balancer.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +Load balancer for Tailscale SSH Sync Agent. +Intelligent task distribution based on machine resources. +""" + +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import parse_cpu_load, parse_memory_usage, parse_disk_usage, calculate_load_score, classify_load_status +from sshsync_wrapper import execute_on_host + +logger = logging.getLogger(__name__) + + +@dataclass +class MachineMetrics: + """Resource metrics for a machine.""" + host: str + cpu_pct: float + mem_pct: float + disk_pct: float + load_score: float + status: str + + +def get_machine_load(host: str, timeout: int = 10) -> Optional[MachineMetrics]: + """ + Get CPU, memory, disk metrics for a machine. + + Args: + host: Host to check + timeout: Command timeout + + Returns: + MachineMetrics object or None on failure + + Example: + >>> metrics = get_machine_load("web-01") + >>> metrics.cpu_pct + 45.2 + >>> metrics.load_score + 0.49 + """ + try: + # Get CPU load + cpu_result = execute_on_host(host, "uptime", timeout=timeout) + cpu_data = {} + if cpu_result.get('success'): + cpu_data = parse_cpu_load(cpu_result['stdout']) + + # Get memory usage + mem_result = execute_on_host(host, "free -m 2>/dev/null || vm_stat", timeout=timeout) + mem_data = {} + if mem_result.get('success'): + mem_data = parse_memory_usage(mem_result['stdout']) + + # Get disk usage + disk_result = execute_on_host(host, "df -h / | tail -1", timeout=timeout) + disk_data = {} + if disk_result.get('success'): + disk_data = parse_disk_usage(disk_result['stdout']) + + # Calculate metrics + # CPU: Use 1-min load average, normalize by assuming 4 cores (adjust as needed) + cpu_pct = (cpu_data.get('load_1min', 0) / 4.0) * 100 if cpu_data else 50.0 + + # Memory: Direct percentage + mem_pct = mem_data.get('use_pct', 50.0) + + # Disk: Direct percentage + disk_pct = disk_data.get('use_pct', 50.0) + + # Calculate load score + score = calculate_load_score(cpu_pct, mem_pct, disk_pct) + status = classify_load_status(score) + + return MachineMetrics( + host=host, + cpu_pct=cpu_pct, + mem_pct=mem_pct, + disk_pct=disk_pct, + load_score=score, + status=status + ) + + except Exception as e: + logger.error(f"Error getting load for {host}: {e}") + return None + + +def select_optimal_host(candidates: List[str], + prefer_group: Optional[str] = None, + timeout: int = 10) -> Tuple[Optional[str], Optional[MachineMetrics]]: + """ + Pick best host from candidates based on load. + + Args: + candidates: List of candidate hosts + prefer_group: Prefer hosts from this group if available + timeout: Timeout for metric gathering + + Returns: + Tuple of (selected_host, metrics) + + Example: + >>> host, metrics = select_optimal_host(["web-01", "web-02", "web-03"]) + >>> host + "web-03" + >>> metrics.load_score + 0.28 + """ + if not candidates: + return None, None + + # Get metrics for all candidates + metrics_list: List[MachineMetrics] = [] + + for host in candidates: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + metrics_list.append(metrics) + + if not metrics_list: + logger.warning("No valid metrics collected from candidates") + return None, None + + # Sort by load score (lower is better) + metrics_list.sort(key=lambda m: m.load_score) + + # If prefer_group specified, prioritize those hosts if load is similar + if prefer_group: + from utils.helpers import parse_sshsync_config, get_groups_for_host + groups_config = parse_sshsync_config() + + # Find hosts in preferred group + preferred_metrics = [ + m for m in metrics_list + if prefer_group in get_groups_for_host(m.host, groups_config) + ] + + # Use preferred if load score within 20% of absolute best + if preferred_metrics: + best_score = metrics_list[0].load_score + for m in preferred_metrics: + if m.load_score <= best_score * 1.2: + return m.host, m + + # Return absolute best + best = metrics_list[0] + return best.host, best + + +def get_group_capacity(group: str, timeout: int = 10) -> Dict: + """ + Get aggregate capacity of a group. + + Args: + group: Group name + timeout: Timeout for metric gathering + + Returns: + Dict with aggregate metrics: + { + 'hosts': List[MachineMetrics], + 'total_hosts': int, + 'avg_cpu': float, + 'avg_mem': float, + 'avg_disk': float, + 'avg_load_score': float, + 'total_capacity': str # descriptive + } + + Example: + >>> capacity = get_group_capacity("production") + >>> capacity['avg_load_score'] + 0.45 + """ + from utils.helpers import parse_sshsync_config + + groups_config = parse_sshsync_config() + group_hosts = groups_config.get(group, []) + + if not group_hosts: + return { + 'error': f'Group {group} not found or has no members', + 'hosts': [] + } + + # Get metrics for all hosts in group + metrics_list: List[MachineMetrics] = [] + + for host in group_hosts: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + metrics_list.append(metrics) + + if not metrics_list: + return { + 'error': f'Could not get metrics for any hosts in {group}', + 'hosts': [] + } + + # Calculate aggregates + avg_cpu = sum(m.cpu_pct for m in metrics_list) / len(metrics_list) + avg_mem = sum(m.mem_pct for m in metrics_list) / len(metrics_list) + avg_disk = sum(m.disk_pct for m in metrics_list) / len(metrics_list) + avg_score = sum(m.load_score for m in metrics_list) / len(metrics_list) + + # Determine overall capacity description + if avg_score < 0.4: + capacity_desc = "High capacity available" + elif avg_score < 0.7: + capacity_desc = "Moderate capacity" + else: + capacity_desc = "Limited capacity" + + return { + 'group': group, + 'hosts': metrics_list, + 'total_hosts': len(metrics_list), + 'available_hosts': len(group_hosts), + 'avg_cpu': avg_cpu, + 'avg_mem': avg_mem, + 'avg_disk': avg_disk, + 'avg_load_score': avg_score, + 'total_capacity': capacity_desc + } + + +def distribute_tasks(tasks: List[Dict], hosts: List[str], + timeout: int = 10) -> Dict[str, List[Dict]]: + """ + Distribute multiple tasks optimally across hosts. + + Args: + tasks: List of task dicts (each with 'command', 'priority', etc) + hosts: Available hosts + timeout: Timeout for metric gathering + + Returns: + Dict mapping hosts to assigned tasks + + Algorithm: + - Get current load for all hosts + - Assign tasks to least loaded hosts + - Balance by estimated task weight + + Example: + >>> tasks = [ + ... {'command': 'npm run build', 'weight': 3}, + ... {'command': 'npm test', 'weight': 2} + ... ] + >>> distribution = distribute_tasks(tasks, ["web-01", "web-02"]) + >>> distribution["web-01"] + [{'command': 'npm run build', 'weight': 3}] + """ + if not tasks or not hosts: + return {} + + # Get current load for all hosts + host_metrics = {} + for host in hosts: + metrics = get_machine_load(host, timeout=timeout) + if metrics: + host_metrics[host] = metrics + + if not host_metrics: + logger.error("No valid host metrics available") + return {} + + # Initialize assignment + assignment: Dict[str, List[Dict]] = {host: [] for host in host_metrics.keys()} + host_loads = {host: m.load_score for host, m in host_metrics.items()} + + # Sort tasks by weight (descending) to assign heavy tasks first + sorted_tasks = sorted( + tasks, + key=lambda t: t.get('weight', 1), + reverse=True + ) + + # Assign each task to least loaded host + for task in sorted_tasks: + # Find host with minimum current load + min_host = min(host_loads.keys(), key=lambda h: host_loads[h]) + + # Assign task + assignment[min_host].append(task) + + # Update simulated load (add task weight normalized) + task_weight = task.get('weight', 1) + host_loads[min_host] += (task_weight * 0.1) # 0.1 = scaling factor + + return assignment + + +def format_load_report(metrics: MachineMetrics, compare_to_avg: Optional[Dict] = None) -> str: + """ + Format load metrics as human-readable report. + + Args: + metrics: Machine metrics + compare_to_avg: Optional dict with avg_cpu, avg_mem, avg_disk for comparison + + Returns: + Formatted report string + + Example: + >>> metrics = MachineMetrics('web-01', 45, 60, 40, 0.49, 'moderate') + >>> print(format_load_report(metrics)) + web-01: Load Score: 0.49 (moderate) + CPU: 45.0% | Memory: 60.0% | Disk: 40.0% + """ + lines = [ + f"{metrics.host}: Load Score: {metrics.load_score:.2f} ({metrics.status})", + f" CPU: {metrics.cpu_pct:.1f}% | Memory: {metrics.mem_pct:.1f}% | Disk: {metrics.disk_pct:.1f}%" + ] + + if compare_to_avg: + cpu_vs = metrics.cpu_pct - compare_to_avg.get('avg_cpu', 0) + mem_vs = metrics.mem_pct - compare_to_avg.get('avg_mem', 0) + disk_vs = metrics.disk_pct - compare_to_avg.get('avg_disk', 0) + + comparisons = [] + if abs(cpu_vs) > 10: + comparisons.append(f"CPU {'+' if cpu_vs > 0 else ''}{cpu_vs:.0f}% vs avg") + if abs(mem_vs) > 10: + comparisons.append(f"Mem {'+' if mem_vs > 0 else ''}{mem_vs:.0f}% vs avg") + if abs(disk_vs) > 10: + comparisons.append(f"Disk {'+' if disk_vs > 0 else ''}{disk_vs:.0f}% vs avg") + + if comparisons: + lines.append(f" vs Average: {' | '.join(comparisons)}") + + return "\n".join(lines) + + +def main(): + """Test load balancer functions.""" + print("Testing load balancer...\n") + + print("1. Testing select_optimal_host:") + print(" (Requires configured hosts - using dry-run simulation)") + + # Simulate metrics + test_metrics = [ + MachineMetrics('web-01', 45, 60, 40, 0.49, 'moderate'), + MachineMetrics('web-02', 85, 70, 65, 0.75, 'high'), + MachineMetrics('web-03', 20, 35, 30, 0.28, 'low'), + ] + + # Sort by score + test_metrics.sort(key=lambda m: m.load_score) + best = test_metrics[0] + + print(f" ✓ Best host: {best.host} (score: {best.load_score:.2f})") + print(f" Reason: {best.status} load") + + print("\n2. Format load report:") + report = format_load_report(test_metrics[0], { + 'avg_cpu': 50, + 'avg_mem': 55, + 'avg_disk': 45 + }) + print(report) + + print("\n✅ Load balancer tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/sshsync_wrapper.py b/scripts/sshsync_wrapper.py new file mode 100644 index 0000000..c240a90 --- /dev/null +++ b/scripts/sshsync_wrapper.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +SSH Sync wrapper for Tailscale SSH Sync Agent. +Python interface to sshsync CLI operations. +""" + +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import json +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import parse_ssh_config, parse_sshsync_config, format_bytes, format_duration +from utils.validators import validate_host, validate_group, validate_path_exists, validate_timeout, validate_command + +logger = logging.getLogger(__name__) + + +def get_host_status(group: Optional[str] = None) -> Dict: + """ + Get online/offline status of hosts. + + Args: + group: Optional group to filter (None = all hosts) + + Returns: + Dict with status info + + Example: + >>> status = get_host_status() + >>> status['online_count'] + 8 + """ + try: + # Run sshsync ls --with-status + cmd = ["sshsync", "ls", "--with-status"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + return {'error': result.stderr, 'hosts': []} + + # Parse output + hosts = [] + for line in result.stdout.strip().split('\n'): + if not line or line.startswith('Host') or line.startswith('---'): + continue + + parts = line.split() + if len(parts) >= 2: + host_name = parts[0] + status = parts[1] if len(parts) > 1 else 'unknown' + + hosts.append({ + 'host': host_name, + 'online': status.lower() in ['online', 'reachable', '✓'], + 'status': status + }) + + # Filter by group if specified + if group: + groups_config = parse_sshsync_config() + group_hosts = groups_config.get(group, []) + hosts = [h for h in hosts if h['host'] in group_hosts] + + online_count = sum(1 for h in hosts if h['online']) + + return { + 'hosts': hosts, + 'total_count': len(hosts), + 'online_count': online_count, + 'offline_count': len(hosts) - online_count, + 'availability_pct': (online_count / len(hosts) * 100) if hosts else 0 + } + + except Exception as e: + logger.error(f"Error getting host status: {e}") + return {'error': str(e), 'hosts': []} + + +def execute_on_all(command: str, timeout: int = 10, dry_run: bool = False) -> Dict: + """ + Execute command on all hosts. + + Args: + command: Command to execute + timeout: Timeout in seconds + dry_run: If True, don't actually execute + + Returns: + Dict with results per host + + Example: + >>> result = execute_on_all("uptime", timeout=15) + >>> len(result['results']) + 10 + """ + validate_command(command) + validate_timeout(timeout) + + if dry_run: + return { + 'dry_run': True, + 'command': command, + 'message': 'Would execute on all hosts' + } + + try: + cmd = ["sshsync", "all", f"--timeout={timeout}", command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 30) + + # Parse results (format varies, simplified here) + return { + 'success': result.returncode == 0, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def execute_on_group(group: str, command: str, timeout: int = 10, dry_run: bool = False) -> Dict: + """ + Execute command on specific group. + + Args: + group: Group name + command: Command to execute + timeout: Timeout in seconds + dry_run: Preview without executing + + Returns: + Dict with execution results + + Example: + >>> result = execute_on_group("web-servers", "df -h /var/www") + >>> result['success'] + True + """ + groups_config = parse_sshsync_config() + validate_group(group, list(groups_config.keys())) + validate_command(command) + validate_timeout(timeout) + + if dry_run: + group_hosts = groups_config.get(group, []) + return { + 'dry_run': True, + 'group': group, + 'hosts': group_hosts, + 'command': command, + 'message': f'Would execute on {len(group_hosts)} hosts in group {group}' + } + + try: + cmd = ["sshsync", "group", f"--timeout={timeout}", group, command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 30) + + return { + 'success': result.returncode == 0, + 'group': group, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def execute_on_host(host: str, command: str, timeout: int = 10) -> Dict: + """ + Execute command on single host. + + Args: + host: Host name + command: Command to execute + timeout: Timeout in seconds + + Returns: + Dict with result + + Example: + >>> result = execute_on_host("web-01", "hostname") + >>> result['stdout'] + "web-01" + """ + ssh_hosts = parse_ssh_config() + validate_host(host, list(ssh_hosts.keys())) + validate_command(command) + validate_timeout(timeout) + + try: + cmd = ["ssh", "-o", f"ConnectTimeout={timeout}", host, command] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 5) + + return { + 'success': result.returncode == 0, + 'host': host, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command + } + + except subprocess.TimeoutExpired: + return {'error': f'Command timed out after {timeout}s'} + except Exception as e: + return {'error': str(e)} + + +def push_to_hosts(local_path: str, remote_path: str, + hosts: Optional[List[str]] = None, + group: Optional[str] = None, + recurse: bool = False, + dry_run: bool = False) -> Dict: + """ + Push files to hosts. + + Args: + local_path: Local file/directory path + remote_path: Remote destination path + hosts: Specific hosts (None = all if group also None) + group: Group name + recurse: Recursive copy + dry_run: Preview without executing + + Returns: + Dict with push results + + Example: + >>> result = push_to_hosts("./dist", "/var/www/app", group="production", recurse=True) + >>> result['success'] + True + """ + validate_path_exists(local_path) + + if dry_run: + return { + 'dry_run': True, + 'local_path': local_path, + 'remote_path': remote_path, + 'hosts': hosts, + 'group': group, + 'recurse': recurse, + 'message': 'Would push files' + } + + try: + cmd = ["sshsync", "push"] + + if hosts: + for host in hosts: + cmd.extend(["--host", host]) + elif group: + cmd.extend(["--group", group]) + else: + cmd.append("--all") + + if recurse: + cmd.append("--recurse") + + cmd.extend([local_path, remote_path]) + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + return { + 'success': result.returncode == 0, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'local_path': local_path, + 'remote_path': remote_path + } + + except subprocess.TimeoutExpired: + return {'error': 'Push operation timed out'} + except Exception as e: + return {'error': str(e)} + + +def pull_from_host(host: str, remote_path: str, local_path: str, + recurse: bool = False, dry_run: bool = False) -> Dict: + """ + Pull files from host. + + Args: + host: Host to pull from + remote_path: Remote file/directory path + local_path: Local destination path + recurse: Recursive copy + dry_run: Preview without executing + + Returns: + Dict with pull results + + Example: + >>> result = pull_from_host("web-01", "/var/log/nginx", "./logs", recurse=True) + >>> result['success'] + True + """ + ssh_hosts = parse_ssh_config() + validate_host(host, list(ssh_hosts.keys())) + + if dry_run: + return { + 'dry_run': True, + 'host': host, + 'remote_path': remote_path, + 'local_path': local_path, + 'recurse': recurse, + 'message': f'Would pull from {host}' + } + + try: + cmd = ["sshsync", "pull", "--host", host] + + if recurse: + cmd.append("--recurse") + + cmd.extend([remote_path, local_path]) + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + return { + 'success': result.returncode == 0, + 'host': host, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'remote_path': remote_path, + 'local_path': local_path + } + + except subprocess.TimeoutExpired: + return {'error': 'Pull operation timed out'} + except Exception as e: + return {'error': str(e)} + + +def list_hosts(with_status: bool = True) -> Dict: + """ + List all configured hosts. + + Args: + with_status: Include online/offline status + + Returns: + Dict with hosts info + + Example: + >>> result = list_hosts(with_status=True) + >>> len(result['hosts']) + 10 + """ + if with_status: + return get_host_status() + else: + ssh_hosts = parse_ssh_config() + return { + 'hosts': [{'host': name} for name in ssh_hosts.keys()], + 'count': len(ssh_hosts) + } + + +def get_groups() -> Dict[str, List[str]]: + """ + Get all defined groups and their members. + + Returns: + Dict mapping group names to host lists + + Example: + >>> groups = get_groups() + >>> groups['production'] + ['prod-web-01', 'prod-db-01'] + """ + return parse_sshsync_config() + + +def main(): + """Test sshsync wrapper functions.""" + print("Testing sshsync wrapper...\n") + + print("1. List hosts:") + result = list_hosts(with_status=False) + print(f" Found {result.get('count', 0)} hosts") + + print("\n2. Get groups:") + groups = get_groups() + print(f" Found {len(groups)} groups") + for group, hosts in groups.items(): + print(f" - {group}: {len(hosts)} hosts") + + print("\n3. Test dry-run:") + result = execute_on_all("uptime", dry_run=True) + print(f" Dry-run: {result.get('message', 'OK')}") + + print("\n✅ sshsync wrapper tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/tailscale_manager.py b/scripts/tailscale_manager.py new file mode 100644 index 0000000..2867638 --- /dev/null +++ b/scripts/tailscale_manager.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +""" +Tailscale manager for Tailscale SSH Sync Agent. +Tailscale-specific operations and status management. +""" + +import subprocess +import re +import json +from typing import Dict, List, Optional +from dataclasses import dataclass +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class TailscalePeer: + """Represents a Tailscale peer.""" + hostname: str + ip: str + online: bool + last_seen: Optional[str] = None + os: Optional[str] = None + relay: Optional[str] = None + + +def get_tailscale_status() -> Dict: + """ + Get Tailscale network status (all peers). + + Returns: + Dict with network status: + { + 'connected': bool, + 'peers': List[TailscalePeer], + 'online_count': int, + 'total_count': int, + 'self_ip': str + } + + Example: + >>> status = get_tailscale_status() + >>> status['online_count'] + 8 + >>> status['peers'][0].hostname + 'homelab-1' + """ + try: + # Get status in JSON format + result = subprocess.run( + ["tailscale", "status", "--json"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode != 0: + # Try text format if JSON fails + result = subprocess.run( + ["tailscale", "status"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode != 0: + return { + 'connected': False, + 'error': 'Tailscale not running or accessible', + 'peers': [] + } + + # Parse text format + return _parse_text_status(result.stdout) + + # Parse JSON format + data = json.loads(result.stdout) + return _parse_json_status(data) + + except FileNotFoundError: + return { + 'connected': False, + 'error': 'Tailscale not installed', + 'peers': [] + } + except subprocess.TimeoutExpired: + return { + 'connected': False, + 'error': 'Timeout getting Tailscale status', + 'peers': [] + } + except Exception as e: + logger.error(f"Error getting Tailscale status: {e}") + return { + 'connected': False, + 'error': str(e), + 'peers': [] + } + + +def _parse_json_status(data: Dict) -> Dict: + """Parse Tailscale JSON status.""" + peers = [] + + self_data = data.get('Self', {}) + self_ip = self_data.get('TailscaleIPs', [''])[0] + + for peer_id, peer_data in data.get('Peer', {}).items(): + hostname = peer_data.get('HostName', 'unknown') + ips = peer_data.get('TailscaleIPs', []) + ip = ips[0] if ips else 'unknown' + online = peer_data.get('Online', False) + os = peer_data.get('OS', 'unknown') + + peers.append(TailscalePeer( + hostname=hostname, + ip=ip, + online=online, + os=os + )) + + online_count = sum(1 for p in peers if p.online) + + return { + 'connected': True, + 'peers': peers, + 'online_count': online_count, + 'total_count': len(peers), + 'self_ip': self_ip + } + + +def _parse_text_status(output: str) -> Dict: + """Parse Tailscale text status output.""" + peers = [] + self_ip = None + + for line in output.strip().split('\n'): + line = line.strip() + if not line: + continue + + # Parse format: hostname ip status ... + parts = line.split() + if len(parts) >= 2: + hostname = parts[0] + ip = parts[1] if len(parts) > 1 else 'unknown' + + # Check for self (usually marked with *) + if hostname.endswith('-'): + self_ip = ip + continue + + # Determine online status from additional fields + online = 'offline' not in line.lower() + + peers.append(TailscalePeer( + hostname=hostname, + ip=ip, + online=online + )) + + online_count = sum(1 for p in peers if p.online) + + return { + 'connected': True, + 'peers': peers, + 'online_count': online_count, + 'total_count': len(peers), + 'self_ip': self_ip or 'unknown' + } + + +def check_connectivity(host: str, timeout: int = 5) -> bool: + """ + Ping host via Tailscale. + + Args: + host: Hostname to ping + timeout: Timeout in seconds + + Returns: + True if host responds to ping + + Example: + >>> check_connectivity("homelab-1") + True + """ + try: + result = subprocess.run( + ["tailscale", "ping", "--timeout", f"{timeout}s", "--c", "1", host], + capture_output=True, + text=True, + timeout=timeout + 2 + ) + + # Check if ping succeeded + return result.returncode == 0 or 'pong' in result.stdout.lower() + + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + except Exception as e: + logger.error(f"Error pinging {host}: {e}") + return False + + +def get_peer_info(hostname: str) -> Optional[TailscalePeer]: + """ + Get detailed info about a specific peer. + + Args: + hostname: Peer hostname + + Returns: + TailscalePeer object or None if not found + + Example: + >>> peer = get_peer_info("homelab-1") + >>> peer.ip + '100.64.1.10' + """ + status = get_tailscale_status() + + if not status.get('connected'): + return None + + for peer in status.get('peers', []): + if peer.hostname == hostname or hostname in peer.hostname: + return peer + + return None + + +def list_online_machines() -> List[str]: + """ + List all online Tailscale machines. + + Returns: + List of online machine hostnames + + Example: + >>> machines = list_online_machines() + >>> len(machines) + 8 + """ + status = get_tailscale_status() + + if not status.get('connected'): + return [] + + return [ + peer.hostname + for peer in status.get('peers', []) + if peer.online + ] + + +def get_machine_ip(hostname: str) -> Optional[str]: + """ + Get Tailscale IP for a machine. + + Args: + hostname: Machine hostname + + Returns: + IP address or None if not found + + Example: + >>> ip = get_machine_ip("homelab-1") + >>> ip + '100.64.1.10' + """ + peer = get_peer_info(hostname) + return peer.ip if peer else None + + +def validate_tailscale_ssh(host: str, timeout: int = 10) -> Dict: + """ + Check if Tailscale SSH is working for a host. + + Args: + host: Host to check + timeout: Connection timeout + + Returns: + Dict with validation results: + { + 'working': bool, + 'message': str, + 'details': Dict + } + + Example: + >>> result = validate_tailscale_ssh("homelab-1") + >>> result['working'] + True + """ + # First check if host is in Tailscale network + peer = get_peer_info(host) + + if not peer: + return { + 'working': False, + 'message': f'Host {host} not found in Tailscale network', + 'details': {'peer_found': False} + } + + if not peer.online: + return { + 'working': False, + 'message': f'Host {host} is offline in Tailscale', + 'details': {'peer_found': True, 'online': False} + } + + # Check connectivity + if not check_connectivity(host, timeout=timeout): + return { + 'working': False, + 'message': f'Cannot ping {host} via Tailscale', + 'details': {'peer_found': True, 'online': True, 'ping': False} + } + + # Try SSH connection + try: + result = subprocess.run( + ["tailscale", "ssh", host, "echo", "test"], + capture_output=True, + text=True, + timeout=timeout + ) + + if result.returncode == 0: + return { + 'working': True, + 'message': f'Tailscale SSH to {host} is working', + 'details': { + 'peer_found': True, + 'online': True, + 'ping': True, + 'ssh': True, + 'ip': peer.ip + } + } + else: + return { + 'working': False, + 'message': f'Tailscale SSH failed: {result.stderr}', + 'details': { + 'peer_found': True, + 'online': True, + 'ping': True, + 'ssh': False, + 'error': result.stderr + } + } + + except subprocess.TimeoutExpired: + return { + 'working': False, + 'message': f'Tailscale SSH timed out after {timeout}s', + 'details': {'timeout': True} + } + except Exception as e: + return { + 'working': False, + 'message': f'Error testing Tailscale SSH: {e}', + 'details': {'error': str(e)} + } + + +def get_network_summary() -> str: + """ + Get human-readable network summary. + + Returns: + Formatted summary string + + Example: + >>> print(get_network_summary()) + Tailscale Network: Connected + Online: 8/10 machines (80%) + Self IP: 100.64.1.5 + """ + status = get_tailscale_status() + + if not status.get('connected'): + return "Tailscale Network: Not connected\nError: {}".format( + status.get('error', 'Unknown error') + ) + + lines = [ + "Tailscale Network: Connected", + f"Online: {status['online_count']}/{status['total_count']} machines ({status['online_count']/status['total_count']*100:.0f}%)", + f"Self IP: {status.get('self_ip', 'unknown')}" + ] + + return "\n".join(lines) + + +def main(): + """Test Tailscale manager functions.""" + print("Testing Tailscale manager...\n") + + print("1. Get Tailscale status:") + status = get_tailscale_status() + if status.get('connected'): + print(f" ✓ Connected") + print(f" Peers: {status['total_count']} total, {status['online_count']} online") + else: + print(f" ✗ Not connected: {status.get('error', 'Unknown error')}") + + print("\n2. List online machines:") + machines = list_online_machines() + print(f" Found {len(machines)} online machines") + for machine in machines[:5]: # Show first 5 + print(f" - {machine}") + + print("\n3. Network summary:") + print(get_network_summary()) + + print("\n✅ Tailscale manager tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/helpers.py b/scripts/utils/helpers.py new file mode 100644 index 0000000..5b0921b --- /dev/null +++ b/scripts/utils/helpers.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +""" +Helper utilities for Tailscale SSH Sync Agent. +Provides common formatting, parsing, and utility functions. +""" + +import os +import re +import subprocess +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Any +import yaml +import logging + +logger = logging.getLogger(__name__) + + +def format_bytes(bytes_value: int) -> str: + """ + Format bytes as human-readable string. + + Args: + bytes_value: Number of bytes + + Returns: + Formatted string (e.g., "12.3 MB", "1.5 GB") + + Example: + >>> format_bytes(12582912) + "12.0 MB" + >>> format_bytes(1610612736) + "1.5 GB" + """ + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_value < 1024.0: + return f"{bytes_value:.1f} {unit}" + bytes_value /= 1024.0 + return f"{bytes_value:.1f} PB" + + +def format_duration(seconds: float) -> str: + """ + Format duration as human-readable string. + + Args: + seconds: Duration in seconds + + Returns: + Formatted string (e.g., "2m 15s", "1h 30m") + + Example: + >>> format_duration(135) + "2m 15s" + >>> format_duration(5430) + "1h 30m 30s" + """ + if seconds < 60: + return f"{int(seconds)}s" + + minutes = int(seconds // 60) + secs = int(seconds % 60) + + if minutes < 60: + return f"{minutes}m {secs}s" if secs > 0 else f"{minutes}m" + + hours = minutes // 60 + minutes = minutes % 60 + + parts = [f"{hours}h"] + if minutes > 0: + parts.append(f"{minutes}m") + if secs > 0 and hours == 0: # Only show seconds if < 1 hour + parts.append(f"{secs}s") + + return " ".join(parts) + + +def format_percentage(value: float, decimals: int = 1) -> str: + """ + Format percentage with specified decimals. + + Args: + value: Percentage value (0-100) + decimals: Number of decimal places + + Returns: + Formatted string (e.g., "45.5%") + + Example: + >>> format_percentage(45.567) + "45.6%" + """ + return f"{value:.{decimals}f}%" + + +def parse_ssh_config(config_path: Optional[Path] = None) -> Dict[str, Dict[str, str]]: + """ + Parse SSH config file for host definitions. + + Args: + config_path: Path to SSH config (default: ~/.ssh/config) + + Returns: + Dict mapping host aliases to their configuration: + { + 'host-alias': { + 'hostname': '100.64.1.10', + 'user': 'admin', + 'port': '22', + 'identityfile': '~/.ssh/id_ed25519' + } + } + + Example: + >>> hosts = parse_ssh_config() + >>> hosts['homelab-1']['hostname'] + '100.64.1.10' + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + logger.warning(f"SSH config not found: {config_path}") + return {} + + hosts = {} + current_host = None + + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + + # Skip comments and empty lines + if not line or line.startswith('#'): + continue + + # Host directive + if line.lower().startswith('host '): + host_alias = line.split(maxsplit=1)[1] + # Skip wildcards + if '*' not in host_alias and '?' not in host_alias: + current_host = host_alias + hosts[current_host] = {} + + # Configuration directives + elif current_host: + parts = line.split(maxsplit=1) + if len(parts) == 2: + key, value = parts + hosts[current_host][key.lower()] = value + + return hosts + + except Exception as e: + logger.error(f"Error parsing SSH config: {e}") + return {} + + +def parse_sshsync_config(config_path: Optional[Path] = None) -> Dict[str, List[str]]: + """ + Parse sshsync config file for group definitions. + + Args: + config_path: Path to sshsync config (default: ~/.config/sshsync/config.yaml) + + Returns: + Dict mapping group names to list of hosts: + { + 'production': ['prod-web-01', 'prod-db-01'], + 'development': ['dev-laptop', 'dev-desktop'] + } + + Example: + >>> groups = parse_sshsync_config() + >>> groups['production'] + ['prod-web-01', 'prod-db-01'] + """ + if config_path is None: + config_path = Path.home() / '.config' / 'sshsync' / 'config.yaml' + + if not config_path.exists(): + logger.warning(f"sshsync config not found: {config_path}") + return {} + + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + return config.get('groups', {}) + + except Exception as e: + logger.error(f"Error parsing sshsync config: {e}") + return {} + + +def get_timestamp(iso: bool = True) -> str: + """ + Get current timestamp. + + Args: + iso: If True, return ISO format; otherwise human-readable + + Returns: + Timestamp string + + Example: + >>> get_timestamp(iso=True) + "2025-10-19T19:43:41Z" + >>> get_timestamp(iso=False) + "2025-10-19 19:43:41" + """ + now = datetime.now() + if iso: + return now.strftime("%Y-%m-%dT%H:%M:%SZ") + else: + return now.strftime("%Y-%m-%d %H:%M:%S") + + +def safe_execute(func, *args, default=None, **kwargs) -> Any: + """ + Execute function with error handling. + + Args: + func: Function to execute + *args: Positional arguments + default: Value to return on error + **kwargs: Keyword arguments + + Returns: + Function result or default on error + + Example: + >>> safe_execute(int, "not_a_number", default=0) + 0 + >>> safe_execute(int, "42") + 42 + """ + try: + return func(*args, **kwargs) + except Exception as e: + logger.error(f"Error executing {func.__name__}: {e}") + return default + + +def validate_path(path: str, must_exist: bool = True) -> bool: + """ + Check if path is valid and accessible. + + Args: + path: Path to validate + must_exist: If True, path must exist + + Returns: + True if valid, False otherwise + + Example: + >>> validate_path("/tmp") + True + >>> validate_path("/nonexistent", must_exist=True) + False + """ + p = Path(path).expanduser() + + if must_exist: + return p.exists() + else: + # Check if parent directory exists (for paths that will be created) + return p.parent.exists() + + +def parse_disk_usage(df_output: str) -> Dict[str, Any]: + """ + Parse 'df' command output. + + Args: + df_output: Output from 'df -h' command + + Returns: + Dict with disk usage info: + { + 'filesystem': '/dev/sda1', + 'size': '100G', + 'used': '45G', + 'available': '50G', + 'use_pct': 45, + 'mount': '/' + } + + Example: + >>> output = "Filesystem Size Used Avail Use% Mounted on\\n/dev/sda1 100G 45G 50G 45% /" + >>> parse_disk_usage(output) + {'filesystem': '/dev/sda1', 'size': '100G', ...} + """ + lines = df_output.strip().split('\n') + if len(lines) < 2: + return {} + + # Parse last line (actual data, not header) + data_line = lines[-1] + parts = data_line.split() + + if len(parts) < 6: + return {} + + try: + return { + 'filesystem': parts[0], + 'size': parts[1], + 'used': parts[2], + 'available': parts[3], + 'use_pct': int(parts[4].rstrip('%')), + 'mount': parts[5] + } + except (ValueError, IndexError) as e: + logger.error(f"Error parsing disk usage: {e}") + return {} + + +def parse_memory_usage(free_output: str) -> Dict[str, Any]: + """ + Parse 'free' command output (Linux). + + Args: + free_output: Output from 'free -m' command + + Returns: + Dict with memory info: + { + 'total': 16384, # MB + 'used': 8192, + 'free': 8192, + 'use_pct': 50.0 + } + + Example: + >>> output = "Mem: 16384 8192 8192 0 0 0" + >>> parse_memory_usage(output) + {'total': 16384, 'used': 8192, ...} + """ + lines = free_output.strip().split('\n') + + for line in lines: + if line.startswith('Mem:'): + parts = line.split() + if len(parts) >= 3: + try: + total = int(parts[1]) + used = int(parts[2]) + free = int(parts[3]) if len(parts) > 3 else (total - used) + + return { + 'total': total, + 'used': used, + 'free': free, + 'use_pct': (used / total * 100) if total > 0 else 0 + } + except (ValueError, IndexError) as e: + logger.error(f"Error parsing memory usage: {e}") + + return {} + + +def parse_cpu_load(uptime_output: str) -> Dict[str, float]: + """ + Parse 'uptime' command output for load averages. + + Args: + uptime_output: Output from 'uptime' command + + Returns: + Dict with load averages: + { + 'load_1min': 0.45, + 'load_5min': 0.38, + 'load_15min': 0.32 + } + + Example: + >>> output = "19:43:41 up 5 days, 2:15, 3 users, load average: 0.45, 0.38, 0.32" + >>> parse_cpu_load(output) + {'load_1min': 0.45, 'load_5min': 0.38, 'load_15min': 0.32} + """ + # Find "load average:" part + match = re.search(r'load average:\s+([\d.]+),\s+([\d.]+),\s+([\d.]+)', uptime_output) + + if match: + try: + return { + 'load_1min': float(match.group(1)), + 'load_5min': float(match.group(2)), + 'load_15min': float(match.group(3)) + } + except ValueError as e: + logger.error(f"Error parsing CPU load: {e}") + + return {} + + +def format_host_status(host: str, online: bool, groups: List[str], + latency: Optional[int] = None, + tailscale_connected: bool = False) -> str: + """ + Format host status as display string. + + Args: + host: Host name + online: Whether host is online + groups: List of groups host belongs to + latency: Latency in ms (optional) + tailscale_connected: Tailscale connection status + + Returns: + Formatted status string + + Example: + >>> format_host_status("web-01", True, ["production", "web"], 25, True) + "🟢 web-01 (production, web) - Online - Tailscale: Connected | Latency: 25ms" + """ + icon = "🟢" if online else "🔴" + status = "Online" if online else "Offline" + group_str = ", ".join(groups) if groups else "no group" + + parts = [f"{icon} {host} ({group_str}) - {status}"] + + if tailscale_connected: + parts.append("Tailscale: Connected") + + if latency is not None and online: + parts.append(f"Latency: {latency}ms") + + return " - ".join(parts) + + +def calculate_load_score(cpu_pct: float, mem_pct: float, disk_pct: float) -> float: + """ + Calculate composite load score for a machine. + + Args: + cpu_pct: CPU usage percentage (0-100) + mem_pct: Memory usage percentage (0-100) + disk_pct: Disk usage percentage (0-100) + + Returns: + Load score (0-1, lower is better) + + Formula: + score = (cpu * 0.4) + (mem * 0.3) + (disk * 0.3) + + Example: + >>> calculate_load_score(45, 60, 40) + 0.48 # (0.45*0.4 + 0.60*0.3 + 0.40*0.3) + """ + return (cpu_pct * 0.4 + mem_pct * 0.3 + disk_pct * 0.3) / 100 + + +def classify_load_status(score: float) -> str: + """ + Classify load score into status category. + + Args: + score: Load score (0-1) + + Returns: + Status string: "low", "moderate", or "high" + + Example: + >>> classify_load_status(0.28) + "low" + >>> classify_load_status(0.55) + "moderate" + >>> classify_load_status(0.82) + "high" + """ + if score < 0.4: + return "low" + elif score < 0.7: + return "moderate" + else: + return "high" + + +def classify_latency(latency_ms: int) -> Tuple[str, str]: + """ + Classify network latency. + + Args: + latency_ms: Latency in milliseconds + + Returns: + Tuple of (status, description) + + Example: + >>> classify_latency(25) + ("excellent", "Ideal for interactive tasks") + >>> classify_latency(150) + ("fair", "May impact interactive workflows") + """ + if latency_ms < 50: + return ("excellent", "Ideal for interactive tasks") + elif latency_ms < 100: + return ("good", "Suitable for most operations") + elif latency_ms < 200: + return ("fair", "May impact interactive workflows") + else: + return ("poor", "Investigate network issues") + + +def get_hosts_from_groups(group: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Get list of hosts in a group. + + Args: + group: Group name + groups_config: Groups configuration dict + + Returns: + List of host names in group + + Example: + >>> groups = {'production': ['web-01', 'db-01']} + >>> get_hosts_from_groups('production', groups) + ['web-01', 'db-01'] + """ + return groups_config.get(group, []) + + +def get_groups_for_host(host: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Get list of groups a host belongs to. + + Args: + host: Host name + groups_config: Groups configuration dict + + Returns: + List of group names + + Example: + >>> groups = {'production': ['web-01'], 'web': ['web-01', 'web-02']} + >>> get_groups_for_host('web-01', groups) + ['production', 'web'] + """ + return [group for group, hosts in groups_config.items() if host in hosts] + + +def run_command(command: str, timeout: int = 10) -> Tuple[bool, str, str]: + """ + Run shell command with timeout. + + Args: + command: Command to execute + timeout: Timeout in seconds + + Returns: + Tuple of (success, stdout, stderr) + + Example: + >>> success, stdout, stderr = run_command("echo hello") + >>> success + True + >>> stdout.strip() + "hello" + """ + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=timeout + ) + + return ( + result.returncode == 0, + result.stdout, + result.stderr + ) + + except subprocess.TimeoutExpired: + return (False, "", f"Command timed out after {timeout}s") + except Exception as e: + return (False, "", str(e)) + + +def main(): + """Test helper functions.""" + print("Testing helper functions...\n") + + # Test formatting + print("1. Format bytes:") + print(f" 12582912 bytes = {format_bytes(12582912)}") + print(f" 1610612736 bytes = {format_bytes(1610612736)}") + + print("\n2. Format duration:") + print(f" 135 seconds = {format_duration(135)}") + print(f" 5430 seconds = {format_duration(5430)}") + + print("\n3. Format percentage:") + print(f" 45.567 = {format_percentage(45.567)}") + + print("\n4. Calculate load score:") + score = calculate_load_score(45, 60, 40) + print(f" CPU 45%, Mem 60%, Disk 40% = {score:.2f}") + print(f" Status: {classify_load_status(score)}") + + print("\n5. Classify latency:") + latencies = [25, 75, 150, 250] + for lat in latencies: + status, desc = classify_latency(lat) + print(f" {lat}ms: {status} - {desc}") + + print("\n6. Parse SSH config:") + ssh_hosts = parse_ssh_config() + print(f" Found {len(ssh_hosts)} hosts") + + print("\n7. Parse sshsync config:") + groups = parse_sshsync_config() + print(f" Found {len(groups)} groups") + for group, hosts in groups.items(): + print(f" - {group}: {len(hosts)} hosts") + + print("\n✅ All helpers tested successfully") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/__init__.py b/scripts/utils/validators/__init__.py new file mode 100644 index 0000000..e9cad86 --- /dev/null +++ b/scripts/utils/validators/__init__.py @@ -0,0 +1,43 @@ +""" +Validators package for Tailscale SSH Sync Agent. +""" + +from .parameter_validator import ( + ValidationError, + validate_host, + validate_group, + validate_path_exists, + validate_timeout, + validate_command +) + +from .host_validator import ( + validate_ssh_config, + validate_host_reachable, + validate_group_members, + get_invalid_hosts +) + +from .connection_validator import ( + validate_ssh_connection, + validate_tailscale_connection, + validate_ssh_key, + get_connection_diagnostics +) + +__all__ = [ + 'ValidationError', + 'validate_host', + 'validate_group', + 'validate_path_exists', + 'validate_timeout', + 'validate_command', + 'validate_ssh_config', + 'validate_host_reachable', + 'validate_group_members', + 'get_invalid_hosts', + 'validate_ssh_connection', + 'validate_tailscale_connection', + 'validate_ssh_key', + 'get_connection_diagnostics', +] diff --git a/scripts/utils/validators/connection_validator.py b/scripts/utils/validators/connection_validator.py new file mode 100644 index 0000000..7aa89d4 --- /dev/null +++ b/scripts/utils/validators/connection_validator.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +Connection validators for Tailscale SSH Sync Agent. +Validates SSH and Tailscale connections. +""" + +import subprocess +from typing import Dict, Optional +import logging + +from .parameter_validator import ValidationError + +logger = logging.getLogger(__name__) + + +def validate_ssh_connection(host: str, timeout: int = 10) -> bool: + """ + Test SSH connection works. + + Args: + host: Host to connect to + timeout: Connection timeout in seconds + + Returns: + True if SSH connection successful + + Raises: + ValidationError: If connection fails + + Example: + >>> validate_ssh_connection("web-01") + True + """ + try: + # Try to execute a simple command via SSH + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout={}".format(timeout), + "-o", "BatchMode=yes", + "-o", "StrictHostKeyChecking=no", + host, "echo", "test"], + capture_output=True, + text=True, + timeout=timeout + 5 + ) + + if result.returncode == 0: + return True + else: + # Parse error message + error_msg = result.stderr.strip() + + if "Permission denied" in error_msg: + raise ValidationError( + f"SSH authentication failed for '{host}'\n" + "Check:\n" + "1. SSH key is added: ssh-add -l\n" + "2. Public key is on remote: cat ~/.ssh/authorized_keys\n" + "3. User/key in SSH config is correct" + ) + elif "Connection refused" in error_msg: + raise ValidationError( + f"SSH connection refused for '{host}'\n" + "Check:\n" + "1. SSH server is running on remote\n" + "2. Port 22 is not blocked by firewall" + ) + elif "Connection timed out" in error_msg or "timeout" in error_msg.lower(): + raise ValidationError( + f"SSH connection timed out for '{host}'\n" + "Check:\n" + "1. Host is reachable (ping test)\n" + "2. Tailscale is connected\n" + "3. Network connectivity" + ) + else: + raise ValidationError( + f"SSH connection failed for '{host}': {error_msg}" + ) + + except subprocess.TimeoutExpired: + raise ValidationError( + f"SSH connection timed out for '{host}' (>{timeout}s)" + ) + except Exception as e: + raise ValidationError(f"Error testing SSH connection to '{host}': {e}") + + +def validate_tailscale_connection(host: str) -> bool: + """ + Test Tailscale connectivity to host. + + Args: + host: Host to check + + Returns: + True if Tailscale connection active + + Raises: + ValidationError: If Tailscale not connected + + Example: + >>> validate_tailscale_connection("web-01") + True + """ + try: + # Check if tailscale is running + result = subprocess.run( + ["tailscale", "status"], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode != 0: + raise ValidationError( + "Tailscale is not running\n" + "Start Tailscale: sudo tailscale up" + ) + + # Check if specific host is in the network + if host in result.stdout or host.replace('-', '.') in result.stdout: + return True + else: + raise ValidationError( + f"Host '{host}' not found in Tailscale network\n" + "Ensure host is:\n" + "1. Connected to Tailscale\n" + "2. In the same tailnet\n" + "3. Not expired/offline" + ) + + except FileNotFoundError: + raise ValidationError( + "Tailscale not installed\n" + "Install: https://tailscale.com/download" + ) + except subprocess.TimeoutExpired: + raise ValidationError("Timeout checking Tailscale status") + except Exception as e: + raise ValidationError(f"Error checking Tailscale connection: {e}") + + +def validate_ssh_key(host: str) -> bool: + """ + Check SSH key authentication is working. + + Args: + host: Host to check + + Returns: + True if SSH key auth works + + Raises: + ValidationError: If key auth fails + + Example: + >>> validate_ssh_key("web-01") + True + """ + try: + # Test connection with explicit key-only auth + result = subprocess.run( + ["ssh", "-o", "BatchMode=yes", + "-o", "PasswordAuthentication=no", + "-o", "ConnectTimeout=5", + host, "echo", "test"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + return True + else: + error_msg = result.stderr.strip() + + if "Permission denied" in error_msg: + raise ValidationError( + f"SSH key authentication failed for '{host}'\n" + "Fix:\n" + "1. Add your SSH key: ssh-add ~/.ssh/id_ed25519\n" + "2. Copy public key to remote: ssh-copy-id {}\n" + "3. Verify: ssh -v {} 2>&1 | grep -i 'offering public key'".format(host, host) + ) + else: + raise ValidationError( + f"SSH key validation failed for '{host}': {error_msg}" + ) + + except subprocess.TimeoutExpired: + raise ValidationError(f"Timeout validating SSH key for '{host}'") + except Exception as e: + raise ValidationError(f"Error validating SSH key for '{host}': {e}") + + +def get_connection_diagnostics(host: str) -> Dict[str, any]: + """ + Comprehensive connection testing. + + Args: + host: Host to diagnose + + Returns: + Dict with diagnostic results: + { + 'ping': {'success': bool, 'message': str}, + 'ssh': {'success': bool, 'message': str}, + 'tailscale': {'success': bool, 'message': str}, + 'ssh_key': {'success': bool, 'message': str} + } + + Example: + >>> diag = get_connection_diagnostics("web-01") + >>> diag['ssh']['success'] + True + """ + diagnostics = {} + + # Test 1: Ping + try: + result = subprocess.run( + ["ping", "-c", "1", "-W", "2", host], + capture_output=True, + timeout=3 + ) + diagnostics['ping'] = { + 'success': result.returncode == 0, + 'message': 'Host is reachable' if result.returncode == 0 else 'Host not reachable' + } + except Exception as e: + diagnostics['ping'] = {'success': False, 'message': str(e)} + + # Test 2: SSH connection + try: + validate_ssh_connection(host, timeout=5) + diagnostics['ssh'] = {'success': True, 'message': 'SSH connection works'} + except ValidationError as e: + diagnostics['ssh'] = {'success': False, 'message': str(e).split('\n')[0]} + + # Test 3: Tailscale + try: + validate_tailscale_connection(host) + diagnostics['tailscale'] = {'success': True, 'message': 'Tailscale connected'} + except ValidationError as e: + diagnostics['tailscale'] = {'success': False, 'message': str(e).split('\n')[0]} + + # Test 4: SSH key + try: + validate_ssh_key(host) + diagnostics['ssh_key'] = {'success': True, 'message': 'SSH key authentication works'} + except ValidationError as e: + diagnostics['ssh_key'] = {'success': False, 'message': str(e).split('\n')[0]} + + return diagnostics + + +def main(): + """Test connection validators.""" + print("Testing connection validators...\n") + + print("1. Testing connection diagnostics:") + try: + diag = get_connection_diagnostics("localhost") + print(" Results:") + for test, result in diag.items(): + status = "✓" if result['success'] else "✗" + print(f" {status} {test}: {result['message']}") + except Exception as e: + print(f" Error: {e}") + + print("\n✅ Connection validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/host_validator.py b/scripts/utils/validators/host_validator.py new file mode 100644 index 0000000..15bf12b --- /dev/null +++ b/scripts/utils/validators/host_validator.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Host validators for Tailscale SSH Sync Agent. +Validates host configuration and availability. +""" + +import subprocess +from typing import List, Dict, Optional +from pathlib import Path +import logging + +from .parameter_validator import ValidationError + +logger = logging.getLogger(__name__) + + +def validate_ssh_config(host: str, config_path: Optional[Path] = None) -> bool: + """ + Check if host has SSH config entry. + + Args: + host: Host name to check + config_path: Path to SSH config (default: ~/.ssh/config) + + Returns: + True if host is in SSH config + + Raises: + ValidationError: If host not found in config + + Example: + >>> validate_ssh_config("web-01") + True + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + raise ValidationError( + f"SSH config file not found: {config_path}\n" + "Create ~/.ssh/config with your host definitions" + ) + + # Parse SSH config for this host + host_found = False + + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + if line.lower().startswith('host ') and host in line: + host_found = True + break + + if not host_found: + raise ValidationError( + f"Host '{host}' not found in SSH config: {config_path}\n" + "Add host to SSH config:\n" + f"Host {host}\n" + f" HostName \n" + f" User " + ) + + return True + + except IOError as e: + raise ValidationError(f"Error reading SSH config: {e}") + + +def validate_host_reachable(host: str, timeout: int = 5) -> bool: + """ + Check if host is reachable via ping. + + Args: + host: Host name to check + timeout: Timeout in seconds + + Returns: + True if host is reachable + + Raises: + ValidationError: If host is not reachable + + Example: + >>> validate_host_reachable("web-01", timeout=5) + True + """ + try: + # Try to resolve via SSH config first + result = subprocess.run( + ["ssh", "-G", host], + capture_output=True, + text=True, + timeout=2 + ) + + if result.returncode == 0: + # Extract hostname from SSH config + for line in result.stdout.split('\n'): + if line.startswith('hostname '): + actual_host = line.split()[1] + break + else: + actual_host = host + else: + actual_host = host + + # Ping the host + ping_result = subprocess.run( + ["ping", "-c", "1", "-W", str(timeout), actual_host], + capture_output=True, + text=True, + timeout=timeout + 1 + ) + + if ping_result.returncode == 0: + return True + else: + raise ValidationError( + f"Host '{host}' ({actual_host}) is not reachable\n" + "Check:\n" + "1. Host is powered on\n" + "2. Tailscale is connected\n" + "3. Network connectivity" + ) + + except subprocess.TimeoutExpired: + raise ValidationError(f"Timeout checking host '{host}' (>{timeout}s)") + except Exception as e: + raise ValidationError(f"Error checking host '{host}': {e}") + + +def validate_group_members(group: str, groups_config: Dict[str, List[str]]) -> List[str]: + """ + Ensure group has valid members. + + Args: + group: Group name + groups_config: Groups configuration dict + + Returns: + List of valid hosts in group + + Raises: + ValidationError: If group is empty or has no valid members + + Example: + >>> groups = {'production': ['web-01', 'db-01']} + >>> validate_group_members('production', groups) + ['web-01', 'db-01'] + """ + if group not in groups_config: + raise ValidationError( + f"Group '{group}' not found in configuration\n" + f"Available groups: {', '.join(groups_config.keys())}" + ) + + members = groups_config[group] + + if not members: + raise ValidationError( + f"Group '{group}' has no members\n" + f"Add hosts to group with: sshsync gadd {group}" + ) + + if not isinstance(members, list): + raise ValidationError( + f"Invalid group configuration for '{group}': members must be a list" + ) + + return members + + +def get_invalid_hosts(hosts: List[str], config_path: Optional[Path] = None) -> List[str]: + """ + Find hosts without valid SSH config. + + Args: + hosts: List of host names + config_path: Path to SSH config + + Returns: + List of hosts without valid config + + Example: + >>> get_invalid_hosts(["web-01", "nonexistent"]) + ["nonexistent"] + """ + if config_path is None: + config_path = Path.home() / '.ssh' / 'config' + + if not config_path.exists(): + return hosts # All invalid if no config + + # Parse SSH config + valid_hosts = set() + try: + with open(config_path, 'r') as f: + for line in f: + line = line.strip() + if line.lower().startswith('host '): + host_alias = line.split(maxsplit=1)[1] + if '*' not in host_alias and '?' not in host_alias: + valid_hosts.add(host_alias) + except IOError: + return hosts + + # Find invalid hosts + return [h for h in hosts if h not in valid_hosts] + + +def main(): + """Test host validators.""" + print("Testing host validators...\n") + + print("1. Testing validate_ssh_config():") + try: + validate_ssh_config("localhost") + print(" ✓ localhost has SSH config") + except ValidationError as e: + print(f" Note: {e.args[0].split(chr(10))[0]}") + + print("\n2. Testing get_invalid_hosts():") + test_hosts = ["localhost", "nonexistent-host-12345"] + invalid = get_invalid_hosts(test_hosts) + print(f" Invalid hosts: {invalid}") + + print("\n✅ Host validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/validators/parameter_validator.py b/scripts/utils/validators/parameter_validator.py new file mode 100644 index 0000000..b9cfd9f --- /dev/null +++ b/scripts/utils/validators/parameter_validator.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +Parameter validators for Tailscale SSH Sync Agent. +Validates user inputs before making operations. +""" + +from typing import List, Optional +from pathlib import Path +import re +import logging + +logger = logging.getLogger(__name__) + + +class ValidationError(Exception): + """Raised when validation fails.""" + pass + + +def validate_host(host: str, valid_hosts: Optional[List[str]] = None) -> str: + """ + Validate host parameter. + + Args: + host: Host name or alias + valid_hosts: List of valid hosts (None to skip check) + + Returns: + str: Validated and normalized host name + + Raises: + ValidationError: If host is invalid + + Example: + >>> validate_host("web-01") + "web-01" + >>> validate_host("web-01", ["web-01", "web-02"]) + "web-01" + """ + if not host: + raise ValidationError("Host cannot be empty") + + if not isinstance(host, str): + raise ValidationError(f"Host must be string, got {type(host)}") + + # Normalize (strip whitespace, lowercase for comparison) + host = host.strip() + + # Basic validation: alphanumeric, dash, underscore, dot + if not re.match(r'^[a-zA-Z0-9._-]+$', host): + raise ValidationError( + f"Invalid host name format: {host}\n" + "Host names must contain only letters, numbers, dots, dashes, and underscores" + ) + + # Check if valid (if list provided) + if valid_hosts: + # Try exact match first + if host in valid_hosts: + return host + + # Try case-insensitive match + for valid_host in valid_hosts: + if host.lower() == valid_host.lower(): + return valid_host + + # Not found - provide suggestions + suggestions = [h for h in valid_hosts if host[:3].lower() in h.lower()] + raise ValidationError( + f"Invalid host: {host}\n" + f"Valid options: {', '.join(valid_hosts[:10])}\n" + + (f"Did you mean: {', '.join(suggestions[:3])}?" if suggestions else "") + ) + + return host + + +def validate_group(group: str, valid_groups: Optional[List[str]] = None) -> str: + """ + Validate group parameter. + + Args: + group: Group name + valid_groups: List of valid groups (None to skip check) + + Returns: + str: Validated group name + + Raises: + ValidationError: If group is invalid + + Example: + >>> validate_group("production") + "production" + >>> validate_group("prod", ["production", "development"]) + ValidationError: Invalid group: prod + """ + if not group: + raise ValidationError("Group cannot be empty") + + if not isinstance(group, str): + raise ValidationError(f"Group must be string, got {type(group)}") + + # Normalize + group = group.strip().lower() + + # Basic validation + if not re.match(r'^[a-z0-9_-]+$', group): + raise ValidationError( + f"Invalid group name format: {group}\n" + "Group names must contain only lowercase letters, numbers, dashes, and underscores" + ) + + # Check if valid (if list provided) + if valid_groups: + if group not in valid_groups: + suggestions = [g for g in valid_groups if group[:3] in g] + raise ValidationError( + f"Invalid group: {group}\n" + f"Valid groups: {', '.join(valid_groups)}\n" + + (f"Did you mean: {', '.join(suggestions[:3])}?" if suggestions else "") + ) + + return group + + +def validate_path_exists(path: str, must_be_file: bool = False, + must_be_dir: bool = False) -> Path: + """ + Validate path exists and is accessible. + + Args: + path: Path to validate + must_be_file: If True, path must be a file + must_be_dir: If True, path must be a directory + + Returns: + Path: Validated Path object + + Raises: + ValidationError: If path is invalid + + Example: + >>> validate_path_exists("/tmp", must_be_dir=True) + Path('/tmp') + >>> validate_path_exists("/nonexistent") + ValidationError: Path does not exist: /nonexistent + """ + if not path: + raise ValidationError("Path cannot be empty") + + p = Path(path).expanduser().resolve() + + if not p.exists(): + raise ValidationError( + f"Path does not exist: {path}\n" + f"Resolved to: {p}" + ) + + if must_be_file and not p.is_file(): + raise ValidationError(f"Path must be a file: {path}") + + if must_be_dir and not p.is_dir(): + raise ValidationError(f"Path must be a directory: {path}") + + return p + + +def validate_timeout(timeout: int, min_timeout: int = 1, + max_timeout: int = 600) -> int: + """ + Validate timeout parameter. + + Args: + timeout: Timeout in seconds + min_timeout: Minimum allowed timeout + max_timeout: Maximum allowed timeout + + Returns: + int: Validated timeout + + Raises: + ValidationError: If timeout is invalid + + Example: + >>> validate_timeout(10) + 10 + >>> validate_timeout(0) + ValidationError: Timeout must be between 1 and 600 seconds + """ + if not isinstance(timeout, int): + raise ValidationError(f"Timeout must be integer, got {type(timeout)}") + + if timeout < min_timeout: + raise ValidationError( + f"Timeout too low: {timeout}s (minimum: {min_timeout}s)" + ) + + if timeout > max_timeout: + raise ValidationError( + f"Timeout too high: {timeout}s (maximum: {max_timeout}s)" + ) + + return timeout + + +def validate_command(command: str, allow_dangerous: bool = False) -> str: + """ + Basic command safety validation. + + Args: + command: Command to validate + allow_dangerous: If False, block potentially dangerous commands + + Returns: + str: Validated command + + Raises: + ValidationError: If command is invalid or dangerous + + Example: + >>> validate_command("ls -la") + "ls -la" + >>> validate_command("rm -rf /", allow_dangerous=False) + ValidationError: Potentially dangerous command blocked: rm -rf + """ + if not command: + raise ValidationError("Command cannot be empty") + + if not isinstance(command, str): + raise ValidationError(f"Command must be string, got {type(command)}") + + command = command.strip() + + if not allow_dangerous: + # Check for dangerous patterns + dangerous_patterns = [ + (r'\brm\s+-rf\s+/', "rm -rf on root directory"), + (r'\bmkfs\.', "filesystem formatting"), + (r'\bdd\s+.*of=/dev/', "disk writing with dd"), + (r':(){:|:&};:', "fork bomb"), + (r'>\s*/dev/sd[a-z]', "direct disk writing"), + ] + + for pattern, description in dangerous_patterns: + if re.search(pattern, command, re.IGNORECASE): + raise ValidationError( + f"Potentially dangerous command blocked: {description}\n" + f"Command: {command}\n" + "Use allow_dangerous=True if you really want to execute this" + ) + + return command + + +def validate_hosts_list(hosts: List[str], valid_hosts: Optional[List[str]] = None) -> List[str]: + """ + Validate a list of hosts. + + Args: + hosts: List of host names + valid_hosts: List of valid hosts (None to skip check) + + Returns: + List[str]: Validated host names + + Raises: + ValidationError: If any host is invalid + + Example: + >>> validate_hosts_list(["web-01", "web-02"]) + ["web-01", "web-02"] + """ + if not hosts: + raise ValidationError("Hosts list cannot be empty") + + if not isinstance(hosts, list): + raise ValidationError(f"Hosts must be list, got {type(hosts)}") + + validated = [] + errors = [] + + for host in hosts: + try: + validated.append(validate_host(host, valid_hosts)) + except ValidationError as e: + errors.append(str(e)) + + if errors: + raise ValidationError( + f"Invalid hosts in list:\n" + "\n".join(errors) + ) + + return validated + + +def main(): + """Test validators.""" + print("Testing parameter validators...\n") + + # Test host validation + print("1. Testing validate_host():") + try: + host = validate_host("web-01", ["web-01", "web-02", "db-01"]) + print(f" ✓ Valid host: {host}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + host = validate_host("invalid-host", ["web-01", "web-02"]) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly rejected: {e.args[0].split(chr(10))[0]}") + + # Test group validation + print("\n2. Testing validate_group():") + try: + group = validate_group("production", ["production", "development"]) + print(f" ✓ Valid group: {group}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + # Test path validation + print("\n3. Testing validate_path_exists():") + try: + path = validate_path_exists("/tmp", must_be_dir=True) + print(f" ✓ Valid path: {path}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + # Test timeout validation + print("\n4. Testing validate_timeout():") + try: + timeout = validate_timeout(10) + print(f" ✓ Valid timeout: {timeout}s") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + timeout = validate_timeout(0) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly rejected: {e.args[0].split(chr(10))[0]}") + + # Test command validation + print("\n5. Testing validate_command():") + try: + cmd = validate_command("ls -la") + print(f" ✓ Safe command: {cmd}") + except ValidationError as e: + print(f" ✗ Error: {e}") + + try: + cmd = validate_command("rm -rf /", allow_dangerous=False) + print(f" ✗ Should have failed!") + except ValidationError as e: + print(f" ✓ Correctly blocked: {e.args[0].split(chr(10))[0]}") + + print("\n✅ All parameter validators tested") + + +if __name__ == "__main__": + main() diff --git a/scripts/workflow_executor.py b/scripts/workflow_executor.py new file mode 100644 index 0000000..1416acb --- /dev/null +++ b/scripts/workflow_executor.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +""" +Workflow executor for Tailscale SSH Sync Agent. +Common multi-machine workflow automation. +""" + +import sys +from pathlib import Path +from typing import Dict, List, Optional +import time +import logging + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils.helpers import format_duration, get_timestamp +from sshsync_wrapper import execute_on_group, execute_on_host, push_to_hosts +from load_balancer import get_group_capacity + +logger = logging.getLogger(__name__) + + +def deploy_workflow(code_path: str, + staging_group: str, + prod_group: str, + run_tests: bool = True) -> Dict: + """ + Full deployment pipeline: staging → test → production. + + Args: + code_path: Path to code to deploy + staging_group: Staging server group + prod_group: Production server group + run_tests: Whether to run tests on staging + + Returns: + Dict with deployment results + + Example: + >>> result = deploy_workflow("./dist", "staging", "production") + >>> result['success'] + True + >>> result['duration'] + "12m 45s" + """ + start_time = time.time() + results = { + 'stages': {}, + 'success': False, + 'start_time': get_timestamp() + } + + try: + # Stage 1: Deploy to staging + logger.info("Stage 1: Deploying to staging...") + stage1 = push_to_hosts( + local_path=code_path, + remote_path="/var/www/app", + group=staging_group, + recurse=True + ) + + results['stages']['staging_deploy'] = stage1 + + if not stage1.get('success'): + results['error'] = 'Staging deployment failed' + return results + + # Build on staging + logger.info("Building on staging...") + build_result = execute_on_group( + staging_group, + "cd /var/www/app && npm run build", + timeout=300 + ) + + results['stages']['staging_build'] = build_result + + if not build_result.get('success'): + results['error'] = 'Staging build failed' + return results + + # Stage 2: Run tests (if enabled) + if run_tests: + logger.info("Stage 2: Running tests...") + test_result = execute_on_group( + staging_group, + "cd /var/www/app && npm test", + timeout=600 + ) + + results['stages']['tests'] = test_result + + if not test_result.get('success'): + results['error'] = 'Tests failed on staging' + return results + + # Stage 3: Validation + logger.info("Stage 3: Validating staging...") + health_result = execute_on_group( + staging_group, + "curl -f http://localhost:3000/health || echo 'Health check failed'", + timeout=10 + ) + + results['stages']['staging_validation'] = health_result + + # Stage 4: Deploy to production + logger.info("Stage 4: Deploying to production...") + prod_deploy = push_to_hosts( + local_path=code_path, + remote_path="/var/www/app", + group=prod_group, + recurse=True + ) + + results['stages']['production_deploy'] = prod_deploy + + if not prod_deploy.get('success'): + results['error'] = 'Production deployment failed' + return results + + # Build and restart on production + logger.info("Building and restarting production...") + prod_build = execute_on_group( + prod_group, + "cd /var/www/app && npm run build && pm2 restart app", + timeout=300 + ) + + results['stages']['production_build'] = prod_build + + # Stage 5: Production verification + logger.info("Stage 5: Verifying production...") + prod_health = execute_on_group( + prod_group, + "curl -f http://localhost:3000/health", + timeout=15 + ) + + results['stages']['production_verification'] = prod_health + + # Success! + results['success'] = True + results['duration'] = format_duration(time.time() - start_time) + + return results + + except Exception as e: + logger.error(f"Deployment workflow error: {e}") + results['error'] = str(e) + results['duration'] = format_duration(time.time() - start_time) + return results + + +def backup_workflow(hosts: List[str], + backup_paths: List[str], + destination: str) -> Dict: + """ + Backup files from multiple hosts. + + Args: + hosts: List of hosts to backup from + backup_paths: Paths to backup on each host + destination: Local destination directory + + Returns: + Dict with backup results + + Example: + >>> result = backup_workflow( + ... ["db-01", "db-02"], + ... ["/var/lib/mysql"], + ... "./backups" + ... ) + >>> result['backed_up_hosts'] + 2 + """ + from sshsync_wrapper import pull_from_host + + start_time = time.time() + results = { + 'hosts': {}, + 'success': True, + 'backed_up_hosts': 0 + } + + for host in hosts: + host_results = [] + + for backup_path in backup_paths: + # Create timestamped backup directory + timestamp = time.strftime("%Y%m%d_%H%M%S") + host_dest = f"{destination}/{host}_{timestamp}" + + result = pull_from_host( + host=host, + remote_path=backup_path, + local_path=host_dest, + recurse=True + ) + + host_results.append(result) + + if not result.get('success'): + results['success'] = False + + results['hosts'][host] = host_results + + if all(r.get('success') for r in host_results): + results['backed_up_hosts'] += 1 + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def sync_workflow(source_host: str, + target_group: str, + paths: List[str]) -> Dict: + """ + Sync files from one host to many. + + Args: + source_host: Host to pull from + target_group: Group to push to + paths: Paths to sync + + Returns: + Dict with sync results + + Example: + >>> result = sync_workflow( + ... "master-db", + ... "replica-dbs", + ... ["/var/lib/mysql/config"] + ... ) + >>> result['success'] + True + """ + from sshsync_wrapper import pull_from_host, push_to_hosts + import tempfile + import shutil + + start_time = time.time() + results = {'paths': {}, 'success': True} + + # Create temp directory + with tempfile.TemporaryDirectory() as temp_dir: + for path in paths: + # Pull from source + pull_result = pull_from_host( + host=source_host, + remote_path=path, + local_path=f"{temp_dir}/{Path(path).name}", + recurse=True + ) + + if not pull_result.get('success'): + results['paths'][path] = { + 'success': False, + 'error': 'Pull from source failed' + } + results['success'] = False + continue + + # Push to targets + push_result = push_to_hosts( + local_path=f"{temp_dir}/{Path(path).name}", + remote_path=path, + group=target_group, + recurse=True + ) + + results['paths'][path] = { + 'pull': pull_result, + 'push': push_result, + 'success': push_result.get('success', False) + } + + if not push_result.get('success'): + results['success'] = False + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def rolling_restart(group: str, + service_name: str, + wait_between: int = 30) -> Dict: + """ + Zero-downtime rolling restart of a service across group. + + Args: + group: Group to restart + service_name: Service name (e.g., "nginx", "app") + wait_between: Seconds to wait between restarts + + Returns: + Dict with restart results + + Example: + >>> result = rolling_restart("web-servers", "nginx") + >>> result['restarted_count'] + 3 + """ + from utils.helpers import parse_sshsync_config + + start_time = time.time() + groups_config = parse_sshsync_config() + hosts = groups_config.get(group, []) + + if not hosts: + return { + 'success': False, + 'error': f'Group {group} not found or empty' + } + + results = { + 'hosts': {}, + 'restarted_count': 0, + 'failed_count': 0, + 'success': True + } + + for host in hosts: + logger.info(f"Restarting {service_name} on {host}...") + + # Restart service + restart_result = execute_on_host( + host, + f"sudo systemctl restart {service_name} || sudo service {service_name} restart", + timeout=30 + ) + + # Health check + time.sleep(5) # Wait for service to start + + health_result = execute_on_host( + host, + f"sudo systemctl is-active {service_name} || sudo service {service_name} status", + timeout=10 + ) + + success = restart_result.get('success') and health_result.get('success') + + results['hosts'][host] = { + 'restart': restart_result, + 'health': health_result, + 'success': success + } + + if success: + results['restarted_count'] += 1 + logger.info(f"✓ {host} restarted successfully") + else: + results['failed_count'] += 1 + results['success'] = False + logger.error(f"✗ {host} restart failed") + + # Wait before next restart (except last) + if host != hosts[-1]: + time.sleep(wait_between) + + results['duration'] = format_duration(time.time() - start_time) + + return results + + +def health_check_workflow(group: str, + endpoint: str = "/health", + timeout: int = 10) -> Dict: + """ + Check health endpoint across group. + + Args: + group: Group to check + endpoint: Health endpoint path + timeout: Request timeout + + Returns: + Dict with health check results + + Example: + >>> result = health_check_workflow("production", "/health") + >>> result['healthy_count'] + 3 + """ + from utils.helpers import parse_sshsync_config + + groups_config = parse_sshsync_config() + hosts = groups_config.get(group, []) + + if not hosts: + return { + 'success': False, + 'error': f'Group {group} not found or empty' + } + + results = { + 'hosts': {}, + 'healthy_count': 0, + 'unhealthy_count': 0 + } + + for host in hosts: + health_result = execute_on_host( + host, + f"curl -f -s -o /dev/null -w '%{{http_code}}' http://localhost:3000{endpoint}", + timeout=timeout + ) + + is_healthy = ( + health_result.get('success') and + '200' in health_result.get('stdout', '') + ) + + results['hosts'][host] = { + 'healthy': is_healthy, + 'response': health_result.get('stdout', '').strip() + } + + if is_healthy: + results['healthy_count'] += 1 + else: + results['unhealthy_count'] += 1 + + results['success'] = results['unhealthy_count'] == 0 + + return results + + +def main(): + """Test workflow executor functions.""" + print("Testing workflow executor...\n") + + print("Note: Workflow executor requires configured hosts and groups.") + print("Tests would execute real operations, so showing dry-run simulations.\n") + + print("✅ Workflow executor ready") + + +if __name__ == "__main__": + main() diff --git a/tests/test_helpers.py b/tests/test_helpers.py new file mode 100644 index 0000000..d50226e --- /dev/null +++ b/tests/test_helpers.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Tests for helper utilities. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) + +from utils.helpers import * + + +def test_format_bytes(): + """Test byte formatting.""" + assert format_bytes(0) == "0.0 B" + assert format_bytes(512) == "512.0 B" + assert format_bytes(1024) == "1.0 KB" + assert format_bytes(1048576) == "1.0 MB" + assert format_bytes(1073741824) == "1.0 GB" + print("✓ format_bytes() passed") + return True + + +def test_format_duration(): + """Test duration formatting.""" + assert format_duration(30) == "30s" + assert format_duration(65) == "1m 5s" + assert format_duration(3600) == "1h" + assert format_duration(3665) == "1h 1m" + assert format_duration(7265) == "2h 1m" + print("✓ format_duration() passed") + return True + + +def test_format_percentage(): + """Test percentage formatting.""" + assert format_percentage(45.567) == "45.6%" + assert format_percentage(100) == "100.0%" + assert format_percentage(0.123, decimals=2) == "0.12%" + print("✓ format_percentage() passed") + return True + + +def test_calculate_load_score(): + """Test load score calculation.""" + score = calculate_load_score(50, 50, 50) + assert 0 <= score <= 1 + assert abs(score - 0.5) < 0.01 + + score_low = calculate_load_score(20, 30, 25) + score_high = calculate_load_score(80, 85, 90) + assert score_low < score_high + + print("✓ calculate_load_score() passed") + return True + + +def test_classify_load_status(): + """Test load status classification.""" + assert classify_load_status(0.2) == "low" + assert classify_load_status(0.5) == "moderate" + assert classify_load_status(0.8) == "high" + print("✓ classify_load_status() passed") + return True + + +def test_classify_latency(): + """Test latency classification.""" + status, desc = classify_latency(25) + assert status == "excellent" + assert "interactive" in desc.lower() + + status, desc = classify_latency(150) + assert status == "fair" + + print("✓ classify_latency() passed") + return True + + +def test_parse_disk_usage(): + """Test disk usage parsing.""" + sample_output = """Filesystem Size Used Avail Use% Mounted on +/dev/sda1 100G 45G 50G 45% /""" + + result = parse_disk_usage(sample_output) + assert result['filesystem'] == '/dev/sda1' + assert result['size'] == '100G' + assert result['used'] == '45G' + assert result['use_pct'] == 45 + + print("✓ parse_disk_usage() passed") + return True + + +def test_parse_cpu_load(): + """Test CPU load parsing.""" + sample_output = "19:43:41 up 5 days, 2:15, 3 users, load average: 0.45, 0.38, 0.32" + + result = parse_cpu_load(sample_output) + assert result['load_1min'] == 0.45 + assert result['load_5min'] == 0.38 + assert result['load_15min'] == 0.32 + + print("✓ parse_cpu_load() passed") + return True + + +def test_get_timestamp(): + """Test timestamp generation.""" + ts_iso = get_timestamp(iso=True) + assert 'T' in ts_iso + assert 'Z' in ts_iso + + ts_human = get_timestamp(iso=False) + assert ' ' in ts_human + assert len(ts_human) == 19 # YYYY-MM-DD HH:MM:SS + + print("✓ get_timestamp() passed") + return True + + +def test_validate_path(): + """Test path validation.""" + assert validate_path("/tmp", must_exist=True) == True + assert validate_path("/nonexistent_path_12345", must_exist=False) == False + + print("✓ validate_path() passed") + return True + + +def test_safe_execute(): + """Test safe execution wrapper.""" + # Should return result on success + result = safe_execute(int, "42") + assert result == 42 + + # Should return default on failure + result = safe_execute(int, "not_a_number", default=0) + assert result == 0 + + print("✓ safe_execute() passed") + return True + + +def main(): + """Run all helper tests.""" + print("=" * 70) + print("HELPER TESTS") + print("=" * 70) + + tests = [ + test_format_bytes, + test_format_duration, + test_format_percentage, + test_calculate_load_score, + test_classify_load_status, + test_classify_latency, + test_parse_disk_usage, + test_parse_cpu_load, + test_get_timestamp, + test_validate_path, + test_safe_execute, + ] + + passed = 0 + for test in tests: + try: + if test(): + passed += 1 + except Exception as e: + print(f"✗ {test.__name__} failed: {e}") + + print(f"\nResults: {passed}/{len(tests)} passed") + return passed == len(tests) + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..20b1a9a --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Integration tests for Tailscale SSH Sync Agent. +Tests complete workflows from query to result. +""" + +import sys +from pathlib import Path + +# Add scripts to path +sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) + +from sshsync_wrapper import get_host_status, list_hosts, get_groups +from tailscale_manager import get_tailscale_status, get_network_summary +from load_balancer import format_load_report, MachineMetrics +from utils.helpers import ( + format_bytes, format_duration, format_percentage, + calculate_load_score, classify_load_status, classify_latency +) + + +def test_host_status_basic(): + """Test get_host_status() without errors.""" + print("\n✓ Testing get_host_status()...") + + try: + result = get_host_status() + + # Validations + assert 'hosts' in result, "Missing 'hosts' in result" + assert isinstance(result.get('hosts', []), list), "'hosts' must be list" + + # Should have basic counts even if no hosts configured + assert 'total_count' in result, "Missing 'total_count'" + assert 'online_count' in result, "Missing 'online_count'" + assert 'offline_count' in result, "Missing 'offline_count'" + + print(f" ✓ Found {result.get('total_count', 0)} hosts") + print(f" ✓ Online: {result.get('online_count', 0)}") + print(f" ✓ Offline: {result.get('offline_count', 0)}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + import traceback + traceback.print_exc() + return False + + +def test_list_hosts(): + """Test list_hosts() function.""" + print("\n✓ Testing list_hosts()...") + + try: + result = list_hosts(with_status=False) + + assert 'hosts' in result, "Missing 'hosts' in result" + assert 'count' in result, "Missing 'count' in result" + assert isinstance(result['hosts'], list), "'hosts' must be list" + + print(f" ✓ List hosts working") + print(f" ✓ Found {result['count']} configured hosts") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_get_groups(): + """Test get_groups() function.""" + print("\n✓ Testing get_groups()...") + + try: + groups = get_groups() + + assert isinstance(groups, dict), "Groups must be dict" + + print(f" ✓ Groups config loaded") + print(f" ✓ Found {len(groups)} groups") + + for group, hosts in list(groups.items())[:3]: # Show first 3 + print(f" - {group}: {len(hosts)} hosts") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_tailscale_status(): + """Test Tailscale status check.""" + print("\n✓ Testing get_tailscale_status()...") + + try: + status = get_tailscale_status() + + assert isinstance(status, dict), "Status must be dict" + assert 'connected' in status, "Missing 'connected' field" + + if status.get('connected'): + print(f" ✓ Tailscale connected") + print(f" ✓ Peers: {status.get('total_count', 0)} total, {status.get('online_count', 0)} online") + else: + print(f" ℹ Tailscale not connected: {status.get('error', 'Unknown')}") + print(f" (This is OK if Tailscale is not installed/configured)") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_network_summary(): + """Test network summary generation.""" + print("\n✓ Testing get_network_summary()...") + + try: + summary = get_network_summary() + + assert isinstance(summary, str), "Summary must be string" + assert len(summary) > 0, "Summary cannot be empty" + + print(f" ✓ Network summary generated:") + for line in summary.split('\n'): + print(f" {line}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_format_helpers(): + """Test formatting helper functions.""" + print("\n✓ Testing format helpers...") + + try: + # Test format_bytes + assert format_bytes(1024) == "1.0 KB", "format_bytes failed for 1024" + assert format_bytes(12582912) == "12.0 MB", "format_bytes failed for 12MB" + + # Test format_duration + assert format_duration(65) == "1m 5s", "format_duration failed for 65s" + assert format_duration(3665) == "1h 1m", "format_duration failed for 1h+" + + # Test format_percentage + assert format_percentage(45.567) == "45.6%", "format_percentage failed" + + print(f" ✓ format_bytes(12582912) = {format_bytes(12582912)}") + print(f" ✓ format_duration(3665) = {format_duration(3665)}") + print(f" ✓ format_percentage(45.567) = {format_percentage(45.567)}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_load_score_calculation(): + """Test load score calculation.""" + print("\n✓ Testing calculate_load_score()...") + + try: + # Test various scenarios + score1 = calculate_load_score(45, 60, 40) + assert 0 <= score1 <= 1, "Score must be 0-1" + assert abs(score1 - 0.49) < 0.01, f"Expected ~0.49, got {score1}" + + score2 = calculate_load_score(20, 35, 30) + assert score2 < score1, "Lower usage should have lower score" + + score3 = calculate_load_score(85, 70, 65) + assert score3 > score1, "Higher usage should have higher score" + + print(f" ✓ Low load (20%, 35%, 30%): {score2:.2f}") + print(f" ✓ Med load (45%, 60%, 40%): {score1:.2f}") + print(f" ✓ High load (85%, 70%, 65%): {score3:.2f}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_load_classification(): + """Test load status classification.""" + print("\n✓ Testing classify_load_status()...") + + try: + assert classify_load_status(0.28) == "low", "0.28 should be 'low'" + assert classify_load_status(0.55) == "moderate", "0.55 should be 'moderate'" + assert classify_load_status(0.82) == "high", "0.82 should be 'high'" + + print(f" ✓ Score 0.28 = {classify_load_status(0.28)}") + print(f" ✓ Score 0.55 = {classify_load_status(0.55)}") + print(f" ✓ Score 0.82 = {classify_load_status(0.82)}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_latency_classification(): + """Test network latency classification.""" + print("\n✓ Testing classify_latency()...") + + try: + status1, desc1 = classify_latency(25) + assert status1 == "excellent", "25ms should be 'excellent'" + + status2, desc2 = classify_latency(75) + assert status2 == "good", "75ms should be 'good'" + + status3, desc3 = classify_latency(150) + assert status3 == "fair", "150ms should be 'fair'" + + status4, desc4 = classify_latency(250) + assert status4 == "poor", "250ms should be 'poor'" + + print(f" ✓ 25ms: {status1} - {desc1}") + print(f" ✓ 75ms: {status2} - {desc2}") + print(f" ✓ 150ms: {status3} - {desc3}") + print(f" ✓ 250ms: {status4} - {desc4}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_load_report_formatting(): + """Test load report formatting.""" + print("\n✓ Testing format_load_report()...") + + try: + metrics = MachineMetrics( + host='web-01', + cpu_pct=45.0, + mem_pct=60.0, + disk_pct=40.0, + load_score=0.49, + status='moderate' + ) + + report = format_load_report(metrics) + + assert 'web-01' in report, "Report must include hostname" + assert '0.49' in report, "Report must include load score" + assert 'moderate' in report, "Report must include status" + + print(f" ✓ Report generated:") + for line in report.split('\n'): + print(f" {line}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def test_dry_run_execution(): + """Test dry-run mode for operations.""" + print("\n✓ Testing dry-run execution...") + + try: + from sshsync_wrapper import execute_on_all + + result = execute_on_all("uptime", dry_run=True) + + assert result.get('dry_run') == True, "Must indicate dry-run mode" + assert 'command' in result, "Must include command" + assert 'message' in result, "Must include message" + + print(f" ✓ Dry-run mode working") + print(f" ✓ Command: {result.get('command')}") + print(f" ✓ Message: {result.get('message')}") + + return True + + except Exception as e: + print(f" ✗ FAILED: {e}") + return False + + +def main(): + """Run all integration tests.""" + print("=" * 70) + print("INTEGRATION TESTS - Tailscale SSH Sync Agent") + print("=" * 70) + + tests = [ + ("Host status check", test_host_status_basic), + ("List hosts", test_list_hosts), + ("Get groups", test_get_groups), + ("Tailscale status", test_tailscale_status), + ("Network summary", test_network_summary), + ("Format helpers", test_format_helpers), + ("Load score calculation", test_load_score_calculation), + ("Load classification", test_load_classification), + ("Latency classification", test_latency_classification), + ("Load report formatting", test_load_report_formatting), + ("Dry-run execution", test_dry_run_execution), + ] + + results = [] + for test_name, test_func in tests: + passed = test_func() + results.append((test_name, passed)) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + + for test_name, passed in results: + status = "✅ PASS" if passed else "❌ FAIL" + print(f"{status}: {test_name}") + + passed_count = sum(1 for _, p in results if p) + total_count = len(results) + + print(f"\nResults: {passed_count}/{total_count} passed") + + if passed_count == total_count: + print("\n🎉 All tests passed!") + else: + print(f"\n⚠️ {total_count - passed_count} test(s) failed") + + return passed_count == total_count + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 0000000..77c744f --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Tests for validators. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) + +from utils.validators import * + + +def test_validate_host(): + """Test host validation.""" + # Valid host + assert validate_host("web-01") == "web-01" + assert validate_host(" web-01 ") == "web-01" # Strips whitespace + + # With valid list + assert validate_host("web-01", ["web-01", "web-02"]) == "web-01" + + # Invalid format + try: + validate_host("web@01") # Invalid character + assert False, "Should have raised ValidationError" + except ValidationError: + pass + + print("✓ validate_host() passed") + return True + + +def test_validate_group(): + """Test group validation.""" + # Valid group + assert validate_group("production") == "production" + assert validate_group("PRODUCTION") == "production" # Lowercase normalization + + # With valid list + assert validate_group("production", ["production", "staging"]) == "production" + + # Invalid + try: + validate_group("invalid!", ["production"]) + assert False, "Should have raised ValidationError" + except ValidationError: + pass + + print("✓ validate_group() passed") + return True + + +def test_validate_path_exists(): + """Test path existence validation.""" + # Valid path + path = validate_path_exists("/tmp", must_be_dir=True) + assert isinstance(path, Path) + + # Invalid path + try: + validate_path_exists("/nonexistent_12345") + assert False, "Should have raised ValidationError" + except ValidationError: + pass + + print("✓ validate_path_exists() passed") + return True + + +def test_validate_timeout(): + """Test timeout validation.""" + # Valid timeouts + assert validate_timeout(10) == 10 + assert validate_timeout(1) == 1 + assert validate_timeout(600) == 600 + + # Too low + try: + validate_timeout(0) + assert False, "Should have raised ValidationError" + except ValidationError: + pass + + # Too high + try: + validate_timeout(1000) + assert False, "Should have raised ValidationError" + except ValidationError: + pass + + print("✓ validate_timeout() passed") + return True + + +def test_validate_command(): + """Test command validation.""" + # Safe commands + assert validate_command("ls -la") == "ls -la" + assert validate_command("uptime") == "uptime" + + # Dangerous commands (should fail without allow_dangerous) + try: + validate_command("rm -rf /") + assert False, "Should have blocked dangerous command" + except ValidationError: + pass + + # But should work with allow_dangerous + assert validate_command("rm -rf /tmp/test", allow_dangerous=True) + + print("✓ validate_command() passed") + return True + + +def test_validate_hosts_list(): + """Test list validation.""" + # Valid list + hosts = validate_hosts_list(["web-01", "web-02"]) + assert len(hosts) == 2 + assert "web-01" in hosts + + # Empty list + try: + validate_hosts_list([]) + assert False, "Should have raised ValidationError for empty list" + except ValidationError: + pass + + print("✓ validate_hosts_list() passed") + return True + + +def test_get_invalid_hosts(): + """Test finding invalid hosts.""" + # Test with mix of valid and invalid + # (This would require actual SSH config, so we test the function exists) + result = get_invalid_hosts(["web-01", "nonexistent-host-12345"]) + assert isinstance(result, list) + + print("✓ get_invalid_hosts() passed") + return True + + +def main(): + """Run all validation tests.""" + print("=" * 70) + print("VALIDATION TESTS") + print("=" * 70) + + tests = [ + test_validate_host, + test_validate_group, + test_validate_path_exists, + test_validate_timeout, + test_validate_command, + test_validate_hosts_list, + test_get_invalid_hosts, + ] + + passed = 0 + for test in tests: + try: + if test(): + passed += 1 + except Exception as e: + print(f"✗ {test.__name__} failed: {e}") + import traceback + traceback.print_exc() + + print(f"\nResults: {passed}/{len(tests)} passed") + return passed == len(tests) + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1)