Initial commit

12 .claude-plugin/plugin.json Normal file
@@ -0,0 +1,12 @@
{
  "name": "proxmox-infrastructure",
  "description": "Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage",
  "version": "1.0.0",
  "author": {
    "name": "basher83",
    "email": "basher83@mail.spaceships.work"
  },
  "skills": [
    "./skills"
  ]
}

3 README.md Normal file
@@ -0,0 +1,3 @@
# proxmox-infrastructure

Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage

105 plugin.lock.json Normal file
@@ -0,0 +1,105 @@
{
  "$schema": "internal://schemas/plugin.lock.v1.json",
  "pluginId": "gh:basher83/lunar-claude:plugins/infrastructure/proxmox-infrastructure",
  "normalized": {
    "repo": null,
    "ref": "refs/tags/v20251128.0",
    "commit": "4443a5d5df66f90ee5678d11181044572ae39bcb",
    "treeHash": "5c6ff4105707bab91f3474e49aaed2d449e4ec488f25d2f2d552d6eadd167b54",
    "generatedAt": "2025-11-28T10:14:12.158310Z",
    "toolVersion": "publish_plugins.py@0.2.0"
  },
  "origin": {
    "remote": "git@github.com:zhongweili/42plugin-data.git",
    "branch": "master",
    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
  },
  "manifest": {
    "name": "proxmox-infrastructure",
    "description": "Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage",
    "version": "1.0.0"
  },
  "content": {
    "files": [
      {
        "path": "README.md",
        "sha256": "dc1558215c32922f14e23d784c1b5f7f5296fdd5090e4c1298c7248236443dd7"
      },
      {
        "path": ".claude-plugin/plugin.json",
        "sha256": "37f0fe197fab412f2f5e99afaaa1e87345c0347fa1c6ca53a908bdd7ce3f8e15"
      },
      {
        "path": "skills/proxmox-infrastructure/SKILL.md",
        "sha256": "6277a71e9d31ec7bfc2babbdff53d1492c882f556a83a0f3ffa6a2b7c2418275"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/validate_template.py",
        "sha256": "c23a456e1e24de595e3e70078ae693543e19e7e7d374c63e123a0835aa9c8f18"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/check_cluster_health.py",
        "sha256": "7681ed0b793191437976ca578463d93b69acbf98a640813a4dcf76c563756fef"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/cluster_status.py",
        "sha256": "f492fbb074443ff9f839a390826a3811527594bb851e47d3c8fd7a69aa56af8e"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/check_ceph_health.py",
        "sha256": "25ff530395eb757e1fd16da6cf1162a62d08aefd76fde2af1fa5eb3f08882c0e"
      },
      {
        "path": "skills/proxmox-infrastructure/anti-patterns/common-mistakes.md",
        "sha256": "f294cc1b8f21d397653f6cfe6b5f1eb0f5cb537a6c672cfafe05397ed2ca00d0"
      },
      {
        "path": "skills/proxmox-infrastructure/workflows/cluster-formation.md",
        "sha256": "af28601e2f561bfaf1342506f6f3eda7d2e09e51fa7d37f894beffcab6566d49"
      },
      {
        "path": "skills/proxmox-infrastructure/workflows/ceph-deployment.md",
        "sha256": "ce7b7bd85f0eed0a01ea7dd591b0cd9f1a0ee7b2f4da3fba74f9b532593a82c7"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/main.tf",
        "sha256": "5f36b92de9ac76115291def1992631594339144c1ec6e2c6131908da536b2dc6"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/README.md",
        "sha256": "311a65c49482ae2e87b1e001559a6eeae79f68b7c840cdd640131dec7e8d9c4f"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/variables.tf",
        "sha256": "1eca73d9ae2c16a1f49d55fad2309aa1761f2935074f7624e952b28b3b4d0ce5"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/qemu-guest-agent.md",
        "sha256": "c7545c83bcf443b27c81406b96abf4cfbf63be8114bab30f27536fa3ac1eb679"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/networking.md",
        "sha256": "eff95710e488f40c52203be17a4c11068daac9391fe3b005b874551475fbe5bf"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/storage-management.md",
        "sha256": "357eb01944e0f53ca61470b769c12dc89775e8979440b37f44340f3140f59154"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/cloud-init-patterns.md",
        "sha256": "f8bc068ef9eefe27305ed490415a46fe6eab4f68553b24ff79187b91abaa6fe9"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/api-reference.md",
        "sha256": "6dd249e90f808628687ddf4eac4893eb448eb1f025d6a9a060e9ba7f2d5940fc"
      }
    ],
    "dirSha256": "5c6ff4105707bab91f3474e49aaed2d449e4ec488f25d2f2d552d6eadd167b54"
  },
  "security": {
    "scannedAt": null,
    "scannerVersion": null,
    "flags": []
  }
}

293 skills/proxmox-infrastructure/SKILL.md Normal file
@@ -0,0 +1,293 @@
---
name: proxmox-infrastructure
description: Proxmox VE cluster management including VM provisioning, template creation with cloud-init, QEMU guest agent integration, storage pool management, VLAN-aware bridge configuration, and Proxmox API interactions. Use when working with Proxmox VE, creating VM templates, configuring Proxmox networking, managing CEPH storage, troubleshooting VM deployment issues, or interacting with the Proxmox API.
---

# Proxmox Infrastructure Management

Expert guidance for managing Proxmox VE clusters, creating templates, provisioning VMs, and configuring network infrastructure.

## Quick Start

### Common Tasks

**Create VM Template:**

```bash
# See tools/build-template.yml for the automated playbook
cd ansible && uv run ansible-playbook playbooks/proxmox-build-template.yml
```

**Clone Template to VM:**

```bash
qm clone <template-id> <new-vmid> --name <vm-name>
qm set <new-vmid> --sshkey ~/.ssh/id_rsa.pub
qm set <new-vmid> --ipconfig0 ip=192.168.1.100/24,gw=192.168.1.1
qm start <new-vmid>
```

**Check Cluster Status:**

```bash
# Use tools/cluster_status.py
./tools/cluster_status.py
```

## When to Use This Skill

Activate this skill when:

- Creating or managing Proxmox VM templates
- Provisioning VMs via cloning or Terraform
- Configuring Proxmox networking (bridges, VLANs, bonds)
- Troubleshooting VM deployment or network issues
- Managing CEPH storage pools
- Working with the QEMU guest agent
- Interacting with the Proxmox API via Python or Ansible

## Core Workflows

### 1. Template Creation

#### Method 1: Using Ansible (Recommended)

See [tools/build-template.yml](tools/build-template.yml) for complete automation.

#### Method 2: Manual CLI

See [reference/cloud-init-patterns.md](reference/cloud-init-patterns.md) for detailed steps.

Key points:

- Use the `virtio-scsi-pci` controller for Ubuntu images
- Add a cloud-init CD-ROM drive (`ide2`)
- Configure a serial console for cloud images
- Convert to a template with `qm template <vmid>`

### 2. VM Provisioning

**From Ansible:**
See the existing playbook: [../../ansible/playbooks/proxmox-build-template.yml](../../ansible/playbooks/proxmox-build-template.yml)

**From Terraform:**
See examples in [../../terraform/netbox-vm/](../../terraform/netbox-vm/)

**Key Configuration:**

```yaml
# Ansible example
proxmox_kvm:
  node: foxtrot
  api_host: 192.168.3.5
  vmid: 101
  name: docker-01
  clone: ubuntu-template
  storage: local-lvm
  # Network with VLAN
  net:
    net0: 'virtio,bridge=vmbr0,tag=30'
  ipconfig:
    ipconfig0: 'ip=192.168.3.100/24,gw=192.168.3.1'
```

### 3. Network Configuration

This cluster ("Matrix") uses:

- **vmbr0**: Management (192.168.3.0/24, VLAN 9 for Corosync)
- **vmbr1**: CEPH Public (192.168.5.0/24, MTU 9000)
- **vmbr2**: CEPH Private (192.168.7.0/24, MTU 9000)

See [reference/networking.md](reference/networking.md) for:

- VLAN-aware bridge configuration (see the sketch below)
- Bond setup (802.3ad LACP)
- Routed vs bridged vs NAT setups
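
A minimal VLAN-aware stanza for the management bridge in `/etc/network/interfaces` might look like the following sketch (the port name and address are Foxtrot's, taken from the architecture section below; treat the exact values as illustrative):

```text
auto vmbr0
iface vmbr0 inet static
    address 192.168.3.5/24
    gateway 192.168.3.1
    bridge-ports enp4s0
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```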

## Architecture Reference

### This Cluster ("Matrix")

**Nodes:** Foxtrot, Golf, Hotel (3× MINISFORUM MS-A2)

**Hardware per Node:**

- AMD Ryzen 9 9955HX (16C/32T)
- 64GB DDR5 @ 5600 MT/s
- 3× NVMe: 1× 1TB (boot), 2× 4TB (CEPH)
- 4× NICs: 2× 10GbE SFP+, 2× 2.5GbE

**Network Architecture:**

```text
enp4s0      → vmbr0 (mgmt + vlan9 for corosync)
enp5s0f0np0 → vmbr1 (ceph public, MTU 9000)
enp5s0f1np1 → vmbr2 (ceph private, MTU 9000)
```

See [../../docs/goals.md](../../docs/goals.md) for complete specs.

## Tools Available

### Python Scripts (uv)

**validate_template.py** - Validate template health via the API

```bash
./tools/validate_template.py --template-id 9000
```

**vm_diagnostics.py** - VM health checks

```bash
./tools/vm_diagnostics.py --vmid 101
```

**cluster_status.py** - Cluster health metrics

```bash
./tools/cluster_status.py
```

### Ansible Playbooks

**build-template.yml** - Automated template creation

- Downloads the cloud image
- Creates a VM with the proper configuration
- Converts it to a template

**configure-networking.yml** - VLAN bridge setup

- Creates VLAN-aware bridges
- Configures bonds
- Sets MTU for storage networks

### OpenTofu Modules

**vm-module-example/** - Reusable VM provisioning

- Clone-based deployment
- Cloud-init integration
- Network configuration

See the [examples/](examples/) directory.

**Real Examples from Repository**:

- **Multi-VM Cluster**: [../../terraform/examples/microk8s-cluster](../../terraform/examples/microk8s-cluster) - Comprehensive 3-node MicroK8s deployment using the `for_each` pattern, cross-node cloning, **dual NIC with VLAN** (VLAN 30 primary, VLAN 2 secondary), and Ansible integration
- **Template with Cloud-Init**: [../../terraform/examples/template-with-custom-cloudinit](../../terraform/examples/template-with-custom-cloudinit) - Custom cloud-init snippet configuration
- **VLAN Bridge Configuration**: [../../ansible/playbooks/proxmox-enable-vlan-bridging.yml](../../ansible/playbooks/proxmox-enable-vlan-bridging.yml) - Enable VLAN-aware bridging on Proxmox nodes (supports VLANs 2-4094)

## Troubleshooting

Common issues and solutions:

### Template Creation Issues

**Serial console required:**
Many cloud images need a serial console configured.

```bash
qm set <vmid> --serial0 socket --vga serial0
```

**Boot order:**

```bash
qm set <vmid> --boot order=scsi0
```

### Network Issues

**VLAN not working:**

1. Check that the bridge is VLAN-aware:

   ```bash
   grep "bridge-vlan-aware" /etc/network/interfaces
   ```

2. Verify the VLAN is in `bridge-vids`:

   ```bash
   bridge vlan show
   ```

**MTU problems (CEPH):**
Ensure MTU 9000 on storage networks:

```bash
ip link show vmbr1 | grep mtu
```
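
If the MTU looks right but CEPH still misbehaves, verify that jumbo frames actually pass end-to-end (8972 bytes of payload = 9000 minus 28 bytes of IP and ICMP headers; the peer address is illustrative, any other node's CEPH-network IP works):

```bash
# Fails with "message too long" if any hop drops jumbo frames
ping -M do -s 8972 -c 3 192.168.5.6
```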

### VM Won't Start

1. Check the QEMU guest agent:

   ```bash
   qm agent <vmid> ping
   ```

2. Review cloud-init logs (in the VM):

   ```bash
   cloud-init status --wait
   cat /var/log/cloud-init.log
   ```

3. Validate that the template exists:

   ```bash
   qm list | grep template
   ```

For more issues, see the [troubleshooting/](troubleshooting/) directory.

## Best Practices

1. **Always use templates** - Clone for consistency
2. **SSH keys only** - Never use password auth
3. **VLAN-aware bridges** - Enable for flexibility
4. **MTU 9000 for storage** - Essential for CEPH performance
5. **Serial console** - Required for most cloud images
6. **Guest agent** - Enable for IP detection and graceful shutdown
7. **Tag VMs** - Use meaningful tags for organization (example below)
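
For example, tags can be applied from the CLI (tag names are illustrative):

```bash
qm set 101 --tags docker,vlan30
```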

## Progressive Disclosure

For deeper knowledge:

### Advanced Automation Workflows (from ProxSpray Analysis)

- [Cluster Formation](workflows/cluster-formation.md) - Complete cluster automation with idempotency
- [CEPH Deployment](workflows/ceph-deployment.md) - Automated CEPH storage deployment

### Core Reference

- [Cloud-Init patterns](reference/cloud-init-patterns.md) - Complete template creation guide
- [Network configuration](reference/networking.md) - VLANs, bonds, routing, NAT
- [API reference](reference/api-reference.md) - Proxmox API interactions
- [Storage management](reference/storage-management.md) - CEPH, LVM, datastores
- [QEMU guest agent](reference/qemu-guest-agent.md) - Integration and troubleshooting

### Anti-Patterns & Common Mistakes

- [Common Mistakes](anti-patterns/common-mistakes.md) - Real-world pitfalls from OpenTofu/Ansible deployments, template creation, and remote backend configuration

## Related Skills

- **NetBox + PowerDNS Integration** - Automatic DNS for Proxmox VMs
- **Ansible Best Practices** - Playbook patterns used in this cluster

313 skills/proxmox-infrastructure/anti-patterns/common-mistakes.md Normal file
@@ -0,0 +1,313 @@
# Common Mistakes and Anti-Patterns

Lessons learned from real-world Proxmox deployments. Avoid these pitfalls to save time and frustration.

## VM Provisioning with OpenTofu

**Note**: Use the `tofu` CLI (not `terraform`). All examples use OpenTofu.

### ❌ Cloud-Init File Not on Target Node

**Problem**: `tofu plan` succeeds but the VM fails to start or configure properly.

```hcl
# BAD - Cloud-init file only exists locally
resource "proxmox_virtual_environment_vm" "example" {
  initialization {
    user_data_file_id = "local:snippets/user-data.yaml"  # File doesn't exist on the node!
  }
}
```

**Solution**: The cloud-init YAML file MUST exist on the target Proxmox node's datastore.

```bash
# Upload to the Proxmox node first
scp user-data.yaml root@foxtrot:/var/lib/vz/snippets/

# Or use Ansible to deploy it
ansible proxmox_nodes -m copy -a "src=user-data.yaml dest=/var/lib/vz/snippets/"
```

**Reference**: See `terraform/netbox-template/user-data.yaml.example` for the required format.

---

### ❌ Template Missing on Target Node

**Problem**: `tofu apply` fails with a "template not found" error.

```hcl
# BAD - Template referenced but doesn't exist
resource "proxmox_virtual_environment_vm" "example" {
  node_name = "foxtrot"
  clone {
    vm_id = 9000  # Template doesn't exist on foxtrot!
  }
}
```

**Solution**: Ensure the template exists on the specific node you're deploying to.

```bash
# Check that the template exists
ssh root@foxtrot "qm list | grep 9000"

# Copy the template to another node if needed: clone to a NEW VMID,
# migrate the clone, then re-mark it as a template
ssh root@foxtrot "qm clone 9000 9001 --full --pool templates"
ssh root@foxtrot "qm migrate 9001 golf --with-local-disks"
ssh root@golf "qm template 9001"
```

**Better**: Use the Ansible playbook to create templates consistently across nodes:

```bash
cd ansible && uv run ansible-playbook playbooks/proxmox-build-template.yml
```

---

### ❌ Remote Backend Configuration Errors

**Problem**: OpenTofu fails to authenticate with Proxmox when using the Scalr remote backend.

```hcl
# BAD - Incorrect provider config for a remote backend
provider "proxmox" {
  endpoint = var.proxmox_api_url
  ssh {
    agent = true  # ❌ Doesn't work with a remote backend!
  }
}
```

**Solution (Remote Backend - Scalr)**:

```hcl
provider "proxmox" {
  endpoint = var.proxmox_api_url
  username = var.proxmox_username  # Must use variables
  password = var.proxmox_password  # Must use variables

  ssh {
    agent    = false  # Critical: false for remote backends
    username = var.ssh_username
  }
}
```

Required environment variables:

```bash
export SCALR_HOSTNAME="your-scalr-host"
export SCALR_TOKEN="your-scalr-token"
export TF_VAR_proxmox_username="root@pam"
export TF_VAR_proxmox_password="your-password"
```

**Solution (Local Testing)**:

```hcl
provider "proxmox" {
  endpoint = var.proxmox_api_url

  ssh {
    agent    = true  # Use the SSH agent for local testing
    username = "root"
  }
}
```

**Reference Architecture**:

- Local examples: `terraform/examples/`
- Versioned root modules: `basher83/Triangulum-Prime/terraform-bgp-vm`

---

## Template Creation

### ❌ Cloud Image Not Downloaded to Target Node

**Problem**: The Ansible playbook fails when creating a template from a cloud image.

```yaml
# BAD - Assuming the image exists
- name: Create VM from cloud image
  ansible.builtin.command: >
    qm importdisk {{ template_id }} ubuntu-22.04.img local-lvm
  # Fails: ubuntu-22.04.img doesn't exist!
```

**Solution**: Download the cloud image to the target node first.

```yaml
# GOOD - Download first
- name: Download Ubuntu cloud image
  ansible.builtin.get_url:
    url: https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img
    dest: /tmp/ubuntu-22.04.img
    checksum: sha256:...

- name: Import disk to VM
  ansible.builtin.command: >
    qm importdisk {{ template_id }} /tmp/ubuntu-22.04.img local-lvm
```

**Reference**: See `ansible/playbooks/proxmox-build-template.yml` for the complete workflow.

---

### ❌ Cloud-Init Snippet Format Violations

**Problem**: The VM boots but cloud-init doesn't configure it properly.

```yaml
# BAD - Wrong format
#cloud-config
users:
  - name: admin
    sudo: ALL=(ALL) NOPASSWD:ALL
    # Missing critical fields!
```

**Solution**: Use the standardized snippet format pre-configured for Ansible.

```yaml
# GOOD - Complete format
#cloud-config
users:
  - name: ansible
    groups: sudo
    shell: /bin/bash
    sudo: ALL=(ALL) NOPASSWD:ALL
    ssh_authorized_keys:
      - ssh-ed25519 AAAA...

package_update: true
package_upgrade: false

packages:
  - qemu-guest-agent
  - python3
  - python3-pip

runcmd:
  - systemctl enable qemu-guest-agent
  - systemctl start qemu-guest-agent
```

**Critical Requirements**:

- ✅ MUST include the `qemu-guest-agent` package
- ✅ MUST include `python3` for Ansible compatibility
- ✅ MUST configure an SSH key for the Ansible user
- ✅ MUST enable the qemu-guest-agent service

**Reference Format**: `terraform/netbox-template/user-data.yaml.example`

---

### ❌ Mixing Terraform and Ansible Provisioning

**Problem**: Confusion about which tool is responsible for what.

**Anti-Pattern**:

```hcl
# BAD - Complex provisioning in Terraform
resource "proxmox_virtual_environment_vm" "example" {
  initialization {
    user_data_file_id = "local:snippets/complex-setup.yaml"
    # Hundreds of lines of cloud-init doing app setup
  }
}
```

**Best Practice**: Clear separation of concerns.

**OpenTofu Responsibility**:

- VM resource allocation (CPU, memory, disk)
- Network configuration
- Basic cloud-init (user, SSH keys, qemu-guest-agent)
- Infrastructure provisioning

**Ansible Responsibility**:

- Application installation
- Configuration management
- Service orchestration
- Ongoing management

**Pattern**:

1. OpenTofu: Provision the VM with minimal cloud-init
2. Cloud-init: Create the ansible user; install qemu-guest-agent and python3
3. Ansible: Configure everything else

**Reference Architecture**:

- Template creation: `basher83/Triangulum-Prime/deployments/homelab/templates`
- OpenTofu examples: `terraform/examples/`

---

## Best Practices Summary

### Template Creation

1. ✅ Download cloud images to the target node before import
2. ✅ Use the standardized cloud-init snippet format
3. ✅ Always include qemu-guest-agent
4. ✅ Keep cloud-init minimal - let Ansible handle configuration
5. ✅ Reference: `basher83/Triangulum-Prime/deployments/homelab/templates`

### OpenTofu Provisioning

1. ✅ Verify the template exists on the target node
2. ✅ Upload cloud-init snippets before referencing them
3. ✅ Use `ssh.agent = false` for remote backends (Scalr)
4. ✅ Use `ssh.agent = true` for local testing
5. ✅ Set credentials via OpenTofu variables, never hardcoded
6. ✅ Reference: `terraform/examples/` and `basher83/Triangulum-Prime`

### Workflow

1. ✅ Create the template once per node (or sync it across nodes)
2. ✅ Upload cloud-init snippets to `/var/lib/vz/snippets/`
3. ✅ Provision the VM via OpenTofu (infrastructure)
4. ✅ Configure the VM via Ansible (applications/services) - see the sketch below
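
Put together, the whole flow is a short sequence; the paths, inventory, and playbook names below are illustrative, not fixed conventions of this repository:

```bash
# 2. Put the cloud-init snippet in place on the target node
scp user-data.yaml root@foxtrot:/var/lib/vz/snippets/

# 3. Provision the VM (infrastructure only)
tofu -chdir=terraform/examples/01-basic-vm apply

# 4. Configure applications and services
ansible-playbook -i inventory/hosts.yml site.yml
```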

---

## Quick Troubleshooting

### VM Won't Start After tofu apply

**Check**:

1. Does the template exist? `qm list | grep <template-id>`
2. Does the cloud-init file exist? `ls -la /var/lib/vz/snippets/`
3. Is qemu-guest-agent installed? `qm agent <vmid> ping`

### tofu Can't Connect to Proxmox

**Remote Backend**:

1. `ssh.agent = false`? ✅
2. `SCALR_HOSTNAME` and `SCALR_TOKEN` set? ✅
3. Using OpenTofu variables for credentials? ✅

**Local Testing**:

1. `ssh.agent = true`? ✅
2. SSH key in the agent? `ssh-add -l` ✅
3. Can you SSH to the node? `ssh root@foxtrot` ✅

### Cloud-Init Didn't Configure the VM

**Check**:

1. Does the file format match `user-data.yaml.example`? ✅
2. Does it include qemu-guest-agent? ✅
3. Does it include python3? ✅
4. VM console logs: `qm terminal <vmid>`, then check `/var/log/cloud-init.log`

245 skills/proxmox-infrastructure/examples/01-basic-vm/README.md Normal file
@@ -0,0 +1,245 @@
# Basic VM Deployment Example

**Learning objective:** Deploy your first VM using the unified VM module with minimal configuration.

## What This Example Shows

- ✅ Minimal required configuration for VM deployment
- ✅ Cloning from an existing template
- ✅ Static IP address configuration with cloud-init
- ✅ SSH key injection
- ✅ Module defaults (what you DON'T need to specify)

## Prerequisites

1. **Proxmox template** exists (VMID 9000)
   - Create one using `terraform/netbox-template/` or the Ansible playbook
   - Or use the Triangulum-Prime template examples

2. **Proxmox API credentials** configured:

   ```bash
   export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
   export PROXMOX_VE_API_TOKEN="user@realm!token-id=secret"
   # OR
   export PROXMOX_VE_USERNAME="root@pam"
   export PROXMOX_VE_PASSWORD="your-password"
   ```

3. **SSH public key** available:

   ```bash
   export TF_VAR_ssh_public_key="$(cat ~/.ssh/id_rsa.pub)"
   ```

## Quick Start

### 1. Initialize Terraform

```bash
tofu init
```

### 2. Review the Plan

```bash
tofu plan
```

**Expected resources:**

- 1 VM (cloned from template 9000)
- Cloud-init configuration
- Network interface with a static IP

### 3. Deploy

```bash
tofu apply
```

### 4. Verify

```bash
# SSH into the VM
ssh ansible@192.168.3.100

# Check the VM in Proxmox
qm status 100  # Or whatever VMID was assigned
```

### 5. Cleanup

```bash
tofu destroy
```

## Understanding the Configuration

### What You MUST Specify

```hcl
# These eight parameters are required:
vm_type       = "clone"       # Clone from template
pve_node      = "foxtrot"     # Which node
vm_name       = "test-vm-01"  # VM name
src_clone     = { ... }       # Template to clone
vm_disk       = { ... }       # Disk config
vm_net_ifaces = { ... }       # Network config
vm_init       = { ... }       # Cloud-init config
vm_efi_disk   = { ... }       # EFI boot disk
```

### What Uses Defaults

The module provides sensible defaults for:

| Setting | Default | Why It's Good |
|---------|---------|---------------|
| CPU cores | 2 | Minimal baseline |
| Memory | 2048 MB (2 GB) | Enough for most services |
| CPU type | `host` | Best performance |
| Guest agent | Enabled | Needed for IP detection |
| BIOS | `ovmf` (UEFI) | Modern, secure |
| Machine | `q35` | Modern chipset |
| Display | Standard VGA | Works everywhere |
| Serial console | Enabled | Troubleshooting |
| RNG device | Enabled | Entropy for crypto |

**See:** [Module DEFAULTS.md](https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md)

## Customization

### Change VM Resources

Override defaults in `main.tf`:

```hcl
module "basic_vm" {
  # ... required params ...

  # Override CPU
  vm_cpu = {
    cores = 4  # Increase to 4 cores
  }

  # Override memory
  vm_mem = {
    dedicated = 8192  # 8 GB
  }
}
```

### Use a Different Template

Change the template ID:

```hcl
src_clone = {
  datastore_id = "local-lvm"
  tpl_id       = 9001  # Different template
}
```

### Add VLAN Tagging

```hcl
vm_net_ifaces = {
  net0 = {
    bridge    = "vmbr0"
    vlan_id   = 30  # Add VLAN tag
    ipv4_addr = "192.168.3.100/24"
    ipv4_gw   = "192.168.3.1"
  }
}
```

## Common Issues

### Issue: "Template 9000 not found"

**Solution:** Create a template first:

```bash
cd ../../..  # Back to the repo root
cd terraform/netbox-template
tofu apply
```

### Issue: "IP address already in use"

**Solution:** Change the `ip_address` variable:

```bash
tofu apply -var="ip_address=192.168.3.101"
```

### Issue: "Cannot connect to Proxmox API"

**Solution:** Check credentials:

```bash
echo $PROXMOX_VE_ENDPOINT
echo $PROXMOX_VE_API_TOKEN
```

### Issue: "EFI disk creation failed"

**Solution:** Ensure the datastore has space:

```bash
# On the Proxmox node
pvesm status
```

## Next Steps

### Learn More

1. **Production Configuration:** See `../02-production-vm/`
   - Shows common overrides for production
   - Resource-sizing best practices
   - Tagging and organization

2. **Template Creation:** See `../03-template-creation/`
   - How to create templates from cloud images
   - Template best practices

3. **Complete Examples:** Triangulum-Prime repository
   - [Single VM](https://github.com/basher83/Triangulum-Prime/tree/main/examples/single-vm)
   - [MicroK8s Cluster](https://github.com/basher83/Triangulum-Prime/tree/main/examples/microk8s-cluster)
   - [Custom Cloud-init](https://github.com/basher83/Triangulum-Prime/tree/main/examples/template-with-custom-cloudinit)

### Integration Examples

- **NetBox + DNS:** See `.claude/skills/netbox-powerdns-integration/examples/01-vm-with-dns/`
- **Ansible Configuration:** See `.claude/skills/ansible-best-practices/examples/`

## Module Documentation

- **README:** [terraform-bgp-vm](https://github.com/basher83/Triangulum-Prime/tree/main/terraform-bgp-vm)
- **DEFAULTS:** [DEFAULTS.md](https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md)
- **Full API:** The module's variables.tf

## Philosophy: DRY (Don't Repeat Yourself)

This example follows the module's DRY principle:

✅ **Good:** Only specify what differs from the defaults

```hcl
vm_cpu = {
  cores = 4  # Only override cores; use the default type
}
```

❌ **Bad:** Repeating module defaults

```hcl
vm_cpu = {
  cores = 4
  type  = "host"  # This is already the default!
}
```

**Why?** It reduces maintenance burden and makes changes obvious.

138 skills/proxmox-infrastructure/examples/01-basic-vm/main.tf Normal file
@@ -0,0 +1,138 @@
# =============================================================================
# Basic VM Deployment Example
# =============================================================================
# This is a minimal example for learning the VM module. It shows only the
# required parameters, with sensible defaults for everything else.
#
# Use this as a starting point for understanding the module, then refer to
# the Triangulum-Prime examples for production-ready configurations.

terraform {
  required_version = ">= 1.0"

  required_providers {
    proxmox = {
      source  = "bpg/proxmox"
      version = "~> 0.69"
    }
  }
}

# Provider configuration (credentials from the environment)
provider "proxmox" {
  endpoint = var.proxmox_endpoint
  # Uses PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD from the environment
}

# =============================================================================
# Basic VM Module Usage
# =============================================================================

module "basic_vm" {
  source = "github.com/basher83/Triangulum-Prime//terraform-bgp-vm?ref=vm/1.0.1"

  # === REQUIRED: Basic Configuration ===
  vm_type  = "clone"           # Clone from an existing template
  pve_node = var.proxmox_node  # Which Proxmox node to deploy on
  vm_name  = var.vm_name       # Name of the VM

  # === REQUIRED: Clone Source ===
  # Specify which template to clone from
  src_clone = {
    datastore_id = "local-lvm"
    tpl_id       = 9000  # Your template VMID
  }

  # === REQUIRED: Disk Configuration ===
  # Define the VM's disk
  vm_disk = {
    scsi0 = {
      datastore_id = "local-lvm"
      size         = 20  # GB
      main_disk    = true
      # Note: file_format, iothread, ssd, discard use optimal defaults
    }
  }

  # === REQUIRED: Network Configuration ===
  # At minimum, configure one network interface
  vm_net_ifaces = {
    net0 = {
      bridge    = "vmbr0"
      ipv4_addr = "${var.ip_address}/24"
      ipv4_gw   = var.gateway
      # Note: model defaults to "virtio", vlan_id defaults to null
    }
  }

  # === REQUIRED: Cloud-init Configuration ===
  vm_init = {
    datastore_id = "local-lvm"

    user = {
      name = var.username
      keys = [var.ssh_public_key]
    }

    dns = {
      domain  = "spaceships.work"
      servers = ["192.168.3.1"]
    }
  }

  # === REQUIRED: EFI Disk (for UEFI boot) ===
  vm_efi_disk = {
    datastore_id = "local-lvm"
    # file_format defaults to "raw"
    # type defaults to "4m"
  }

  # === OPTIONAL OVERRIDES ===
  # These are shown here only for educational purposes.
  # The module already provides these defaults - you DON'T need to specify them!

  # CPU (defaults to 2 cores, "host" type)
  # vm_cpu = {
  #   cores = 2
  #   type  = "host"
  # }

  # Memory (defaults to 2048 MB / 2 GB)
  # vm_mem = {
  #   dedicated = 2048
  # }

  # Guest agent (defaults to enabled)
  # vm_agent = {
  #   enabled = true
  # }

  # VM start behavior (defaults: start on deploy, start on boot)
  # vm_start = {
  #   on_deploy = true
  #   on_boot   = true
  # }

  # === Learn More ===
  # See the module's DEFAULTS.md for the complete list of defaults:
  # https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md
}

# =============================================================================
# Outputs
# =============================================================================

output "vm_id" {
  description = "The ID of the created VM"
  value       = module.basic_vm.vm_id
}

output "vm_name" {
  description = "The name of the created VM"
  value       = module.basic_vm.vm_name
}

output "vm_ipv4_addresses" {
  description = "IPv4 addresses assigned to the VM"
  value       = module.basic_vm.ipv4_addresses
}

41 skills/proxmox-infrastructure/examples/01-basic-vm/variables.tf Normal file
@@ -0,0 +1,41 @@
variable "proxmox_endpoint" {
  description = "Proxmox API endpoint (e.g., https://192.168.3.5:8006)"
  type        = string
  default     = "https://192.168.3.5:8006"
}

variable "proxmox_node" {
  description = "Proxmox node to deploy on"
  type        = string
  default     = "foxtrot"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
  default     = "test-vm-01"
}

variable "ip_address" {
  description = "Static IP address for the VM (without CIDR)"
  type        = string
  default     = "192.168.3.100"
}

variable "gateway" {
  description = "Network gateway"
  type        = string
  default     = "192.168.3.1"
}

variable "username" {
  description = "VM username for cloud-init"
  type        = string
  default     = "ansible"
}

variable "ssh_public_key" {
  description = "SSH public key for VM access"
  type        = string
  # Set via an environment variable or a tfvars file
}

378 skills/proxmox-infrastructure/reference/api-reference.md Normal file
@@ -0,0 +1,378 @@
# Proxmox API Reference

## Overview

The Proxmox API enables programmatic management of the cluster via REST. This reference focuses on common patterns for Python (proxmoxer) and Terraform/Ansible usage.

## Authentication Methods

### API Tokens (Recommended)

**Create an API token via the CLI:**

```bash
pveum user token add <user>@<realm> <token-id> --privsep 0
```

**Environment variables:**

```bash
export PROXMOX_VE_API_TOKEN="user@realm!token-id=secret"
export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
```

### Password Authentication

```bash
export PROXMOX_VE_USERNAME="root@pam"
export PROXMOX_VE_PASSWORD="password"
export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
```

## Python API Usage (proxmoxer)

### Installation

```bash
# Using uv inline script metadata
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
```

### Basic Connection

```python
#!/usr/bin/env python3
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///

import os

from proxmoxer import ProxmoxAPI

# Connect using an API token (host is extracted from the endpoint URL)
proxmox = ProxmoxAPI(
    os.getenv("PROXMOX_VE_ENDPOINT").replace("https://", "").replace(":8006", ""),
    user=os.getenv("PROXMOX_VE_USERNAME"),
    token_name=os.getenv("PROXMOX_VE_TOKEN_NAME"),
    token_value=os.getenv("PROXMOX_VE_TOKEN_VALUE"),
    verify_ssl=False
)

# OR using a password
proxmox = ProxmoxAPI(
    '192.168.3.5',
    user='root@pam',
    password=os.getenv("PROXMOX_VE_PASSWORD"),
    verify_ssl=False
)
```

### Common Operations

**List VMs:**

```python
# Get all VMs across the cluster
for node in proxmox.nodes.get():
    node_name = node['node']
    for vm in proxmox.nodes(node_name).qemu.get():
        print(f"VM {vm['vmid']}: {vm['name']} on {node_name} - {vm['status']}")
```

**Get VM Configuration:**

```python
vmid = 101
node = "foxtrot"

vm_config = proxmox.nodes(node).qemu(vmid).config.get()
print(f"VM {vmid} config: {vm_config}")
```

**Clone Template:**

```python
import time

template_id = 9000
new_vmid = 101
node = "foxtrot"

# Clone the template; the API returns a task UPID immediately
upid = proxmox.nodes(node).qemu(template_id).clone.post(
    newid=new_vmid,
    name="docker-01-nexus",
    full=1,  # Full clone (not linked)
    storage="local-lvm"
)

# Cloning is asynchronous - poll the task until it finishes
while proxmox.nodes(node).tasks(upid).status.get()['status'] == 'running':
    time.sleep(2)
```

**Update VM Configuration:**

```python
# Set cloud-init parameters
proxmox.nodes(node).qemu(vmid).config.put(
    ipconfig0="ip=192.168.3.100/24,gw=192.168.3.1",
    nameserver="192.168.3.1",
    searchdomain="spaceships.work",
    sshkeys="ssh-rsa AAAA..."
)
```

**Start/Stop VM:**

```python
# Start VM
proxmox.nodes(node).qemu(vmid).status.start.post()

# Stop VM (graceful)
proxmox.nodes(node).qemu(vmid).status.shutdown.post()

# Force stop
proxmox.nodes(node).qemu(vmid).status.stop.post()
```

**Delete VM:**

```python
proxmox.nodes(node).qemu(vmid).delete()
```

### Cluster Operations

**Get Cluster Status:**

```python
cluster_status = proxmox.cluster.status.get()
for node in cluster_status:
    if node['type'] == 'node':
        print(f"Node: {node['name']} - online: {node['online']}")
```

**Get Node Resources:**

```python
node_status = proxmox.nodes(node).status.get()
print(f"CPU: {node_status['cpu']*100:.1f}%")
print(f"Memory: {node_status['memory']['used']/1024**3:.1f}GB / {node_status['memory']['total']/1024**3:.1f}GB")
```

### Storage Operations

**List Storage:**

```python
for storage in proxmox.storage.get():
    print(f"Storage: {storage['storage']} - Type: {storage['type']} - active: {storage['active']}")
```

**Get Storage Content:**

```python
storage = "local-lvm"
# Storage content is listed per node
content = proxmox.nodes(node).storage(storage).content.get()
for item in content:
    print(f"{item['volid']} - {item.get('vmid', 'N/A')} - {item['size']/1024**3:.1f}GB")
```

## Terraform Provider Patterns

### Basic Resource (VM from Clone)

```hcl
resource "proxmox_vm_qemu" "docker_host" {
  name        = "docker-01-nexus"
  target_node = "foxtrot"
  vmid        = 101

  clone      = "ubuntu-template"
  full_clone = true

  cores   = 4
  memory  = 8192
  sockets = 1

  network {
    bridge = "vmbr0"
    model  = "virtio"
    tag    = 30  # VLAN 30
  }

  disk {
    storage = "local-lvm"
    type    = "scsi"
    size    = "50G"
  }

  ipconfig0 = "ip=192.168.3.100/24,gw=192.168.3.1"

  sshkeys = file("~/.ssh/id_rsa.pub")
}
```

### Data Sources

```hcl
# Get template information
data "proxmox_vm_qemu" "template" {
  name        = "ubuntu-template"
  target_node = "foxtrot"
}

# Get storage information
data "proxmox_storage" "local_lvm" {
  node    = "foxtrot"
  storage = "local-lvm"
}
```

## Ansible Module Patterns

### Create VM from Template

```yaml
- name: Clone template to create VM
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    name: docker-01-nexus
    clone: ubuntu-template
    full: true
    storage: local-lvm
    net:
      net0: 'virtio,bridge=vmbr0,tag=30'
    ipconfig:
      ipconfig0: 'ip=192.168.3.100/24,gw=192.168.3.1'
    cores: 4
    memory: 8192
    agent: 1
    state: present
```

### Start VM

```yaml
- name: Start VM
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    state: started
```

## Matrix Cluster Specifics

### Node IP Addresses

```python
MATRIX_NODES = {
    "foxtrot": "192.168.3.5",
    "golf": "192.168.3.6",
    "hotel": "192.168.3.7"
}
```

### Storage Pools

```python
STORAGE_POOLS = {
    "local": "dir",          # Local directory
    "local-lvm": "lvmthin",  # LVM thin on the boot disk
    "ceph-pool": "rbd"       # CEPH RBD (when configured)
}
```

### Network Bridges

```python
BRIDGES = {
    "vmbr0": "192.168.3.0/24",  # Management + VLAN 9 (Corosync)
    "vmbr1": "192.168.5.0/24",  # CEPH Public (MTU 9000)
    "vmbr2": "192.168.7.0/24"   # CEPH Private (MTU 9000)
}
```

## Error Handling

### Python Example

```python
import sys

from proxmoxer import ProxmoxAPI, ResourceException

try:
    proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass', verify_ssl=False)
    vm_config = proxmox.nodes('foxtrot').qemu(101).config.get()
except ResourceException as e:
    print(f"API Error: {e}", file=sys.stderr)
    sys.exit(1)
except Exception as e:
    print(f"Unexpected error: {e}", file=sys.stderr)
    sys.exit(1)
```

### Ansible Example

```yaml
- name: Clone VM with error handling
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    # ... config ...
  register: clone_result
  failed_when: false

- name: Check clone result
  ansible.builtin.fail:
    msg: "Failed to clone VM: {{ clone_result.msg }}"
  when: clone_result.failed
```

## API Endpoints Reference

### Common Endpoints

```text
GET    /api2/json/nodes                                    # List nodes
GET    /api2/json/nodes/{node}/qemu                        # List VMs on a node
GET    /api2/json/nodes/{node}/qemu/{vmid}                 # Get VM status
POST   /api2/json/nodes/{node}/qemu/{vmid}/clone           # Clone VM
PUT    /api2/json/nodes/{node}/qemu/{vmid}/config          # Update config
POST   /api2/json/nodes/{node}/qemu/{vmid}/status/start    # Start VM
POST   /api2/json/nodes/{node}/qemu/{vmid}/status/shutdown # Stop VM
DELETE /api2/json/nodes/{node}/qemu/{vmid}                 # Delete VM

GET    /api2/json/cluster/status                           # Cluster status
GET    /api2/json/storage                                  # List storage
```
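
These endpoints can also be called directly with `curl`; a raw request with an API token looks roughly like this (the token ID and secret are placeholders):

```bash
# -k skips TLS verification (self-signed certs); use a proper CA in production
curl -k -H "Authorization: PVEAPIToken=root@pam!mytoken=<secret>" \
  "https://192.168.3.5:8006/api2/json/nodes"
```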

## Best Practices

1. **Use API tokens** - More secure than password authentication
2. **Handle SSL properly** - Use `verify_ssl=True` with a proper CA cert in production
3. **Check task completion** - Clone/migrate operations are async; poll for completion
4. **Error handling** - Always catch ResourceException and provide meaningful errors
5. **Rate limiting** - Don't hammer the API; add delays in loops
6. **Idempotency** - Check whether a resource exists before creating it (see the sketch below)
7. **Use VMID ranges** - Reserve ranges for different purposes (templates: 9000-9999, VMs: 100-999)
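
As a sketch of the idempotency point, a small existence check with proxmoxer before cloning (the helper name is illustrative):

```python
def vm_exists(proxmox, node: str, vmid: int) -> bool:
    """Return True if a VM or template with this VMID exists on the node."""
    return any(vm["vmid"] == vmid for vm in proxmox.nodes(node).qemu.get())

if not vm_exists(proxmox, "foxtrot", 101):
    proxmox.nodes("foxtrot").qemu(9000).clone.post(newid=101, name="docker-01-nexus")
```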

## Further Reading

- [Proxmox VE API Documentation](https://pve.proxmox.com/pve-docs/api-viewer/)
- [proxmoxer GitHub](https://github.com/proxmoxer/proxmoxer)
- [community.proxmox Collection](https://docs.ansible.com/ansible/latest/collections/community/proxmox/)

163 skills/proxmox-infrastructure/reference/cloud-init-patterns.md Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
# Cloud-Init Patterns for Proxmox VE
|
||||||
|
|
||||||
|
*Source: <https://pve.proxmox.com/wiki/Cloud-Init_Support*>
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Cloud-Init is the de facto multi-distribution package that handles early initialization of virtual machines. When a VM starts for the first time, Cloud-Init applies network and SSH key settings configured on the hypervisor.
|
||||||
|
|
||||||
|
## Template Creation Workflow
|
||||||
|
|
||||||
|
### Download and Import Cloud Image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download Ubuntu cloud image
|
||||||
|
wget https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img
|
||||||
|
|
||||||
|
# Create VM with VirtIO SCSI controller
|
||||||
|
qm create 9000 --memory 2048 --net0 virtio,bridge=vmbr0 --scsihw virtio-scsi-pci
|
||||||
|
|
||||||
|
# Import disk to storage
|
||||||
|
qm set 9000 --scsi0 local-lvm:0,import-from=/path/to/bionic-server-cloudimg-amd64.img
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important**: Ubuntu Cloud-Init images require `virtio-scsi-pci` controller type for SCSI drives.
|
||||||
|
|
||||||
|
### Configure Cloud-Init Components
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Add Cloud-Init CD-ROM drive
|
||||||
|
qm set 9000 --ide2 local-lvm:cloudinit
|
||||||
|
|
||||||
|
# Set boot order (speeds up boot)
|
||||||
|
qm set 9000 --boot order=scsi0
|
||||||
|
|
||||||
|
# Configure serial console (required for many cloud images)
|
||||||
|
qm set 9000 --serial0 socket --vga serial0
|
||||||
|
|
||||||
|
# Convert to template
|
||||||
|
qm template 9000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deploying from Templates
|
||||||
|
|
||||||
|
### Clone Template
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone template to new VM
|
||||||
|
qm clone 9000 123 --name ubuntu2
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configure VM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set SSH public key
|
||||||
|
qm set 123 --sshkey ~/.ssh/id_rsa.pub
|
||||||
|
|
||||||
|
# Configure network
|
||||||
|
qm set 123 --ipconfig0 ip=10.0.10.123/24,gw=10.0.10.1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Custom Cloud-Init Configuration
|
||||||
|
|
||||||
|
### Using Custom Config Files
|
||||||
|
|
||||||
|
Proxmox allows custom cloud-init configurations via the `cicustom` option:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
qm set 9000 --cicustom "user=<volume>,network=<volume>,meta=<volume>"
|
||||||
|
```
|
||||||
|
|
||||||
|
Example using local snippets storage:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
qm set 9000 --cicustom "user=local:snippets/userconfig.yaml"
|
||||||
|
```

### Dump Generated Config

Dump the automatically generated configs to use as a base for custom ones:

```bash
qm cloudinit dump 9000 user
qm cloudinit dump 9000 network
qm cloudinit dump 9000 meta
```
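The same dump is exposed over the API (`GET /nodes/{node}/qemu/{vmid}/cloudinit/dump`); a short sketch, assuming proxmoxer returns the dumped config as a plain string:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Fetch the generated user-data for VM 9000 and save it as a snippet base
userdata = proxmox.nodes('foxtrot').qemu(9000).cloudinit.dump.get(type='user')
with open('userconfig.yaml', 'w') as f:
    f.write(userdata)
```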

## Cloud-Init Options Reference

### cicustom

Specify custom files to replace the automatically generated ones:

- `meta=<volume>` - Meta data (provider specific)
- `network=<volume>` - Network data
- `user=<volume>` - User data
- `vendor=<volume>` - Vendor data

### cipassword

Password for the user. **Not recommended** - use SSH keys instead.

### citype

Configuration format: `configdrive2 | nocloud | opennebula`

- Default: `nocloud` for Linux, `configdrive2` for Windows

### ciupgrade

Automatic package upgrade after first boot (default: `true`)

### ciuser

Username to configure (instead of the image's default user)

### ipconfig[n]

IP addresses and gateways for network interfaces.

Format: `[gw=<GatewayIPv4>] [,gw6=<GatewayIPv6>] [,ip=<IPv4Format/CIDR>] [,ip6=<IPv6Format/CIDR>]`

Special values:

- `ip=dhcp` - Use DHCP for IPv4
- `ip6=auto` - Use stateless autoconfiguration (requires cloud-init 19.4+)

### sshkeys

Public SSH keys (one per line, OpenSSH format)

### nameserver

DNS server IP address

### searchdomain

DNS search domains
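Several of these options can be set in one API call; a sketch against the cloned VM 123 from the earlier example (values are illustrative):

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# ciuser, nameserver, searchdomain, and ipconfig0 map 1:1 to config keys
proxmox.nodes('foxtrot').qemu(123).config.put(
    ciuser='deploy',
    nameserver='10.0.10.1',
    searchdomain='example.internal',
    ipconfig0='ip=dhcp',
)
```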
## Best Practices

1. **Use SSH keys** instead of passwords for authentication
2. **Configure serial console** for cloud images (many require it)
3. **Set boot order** to speed up the boot process
4. **Convert to template** for fast linked-clone deployment
5. **Store custom configs in snippets storage** (must be available on all nodes for migration)
6. **Test with a clone** before modifying a template

## Troubleshooting

### Template Won't Boot

- Check whether the serial console is configured: `qm set <vmid> --serial0 socket --vga serial0`
- Verify the boot order: `qm set <vmid> --boot order=scsi0`

### Network Not Configured

- Ensure the cloud-init CD-ROM is attached: `qm set <vmid> --ide2 local-lvm:cloudinit`
- Check the IP configuration: `qm config <vmid> | grep ipconfig`

### SSH Keys Not Working

- Verify the `sshkeys` format (OpenSSH format, one per line)
- Check the cloud-init logs inside the VM: `cat /var/log/cloud-init.log`
373
skills/proxmox-infrastructure/reference/networking.md
Normal file
@@ -0,0 +1,373 @@

# Proxmox Network Configuration

*Source: <https://pve.proxmox.com/wiki/Network_Configuration>*

## Key Concepts

### Configuration File

All network configuration is in `/etc/network/interfaces`. GUI changes are written to `/etc/network/interfaces.new` for safety and only take effect once applied.

### Applying Changes

**ifupdown2 (recommended):**

```bash
# Apply from the GUI or run:
ifreload -a
```

**Reboot method:**

On reboot, the `pvenetcommit` service activates the staging file before the `networking` service applies it.
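Interfaces can also be staged and applied over the API; a sketch assuming the `POST /nodes/{node}/network` (stage) and `PUT /nodes/{node}/network` (apply) endpoints of current releases, and a spare NIC named `eno2`:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
net = proxmox.nodes('foxtrot').network

# Stage a VLAN-aware bridge (written to /etc/network/interfaces.new)
net.post(
    iface='vmbr1',
    type='bridge',
    autostart=1,
    bridge_ports='eno2',
    bridge_vlan_aware=1,
)

# Apply staged changes (same as "Apply Configuration" / ifreload -a)
net.put()
```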

## Naming Conventions

### Current (Proxmox VE 5.0+)

- Ethernet: `en*` (systemd predictable names)
  - `eno1` - first on-board NIC
  - `enp3s0f1` - function 1 of the NIC on PCI bus 3, slot 0
- Bridges: `vmbr[0-4094]`
- Bonds: `bond[N]`
- VLANs: add the VLAN number after a period: `eno1.50`, `bond1.30`

### Legacy (pre-5.0)

- Ethernet: `eth[N]` (eth0, eth1, ...)

### Pinning Naming Scheme Version

Add to the kernel command line to prevent name changes:

```bash
net.naming-scheme=v252
```

### Overriding Device Names

**Automatic tool:**

```bash
# Generate .link files for all interfaces
pve-network-interface-pinning generate

# With a custom prefix
pve-network-interface-pinning generate --prefix myprefix

# Pin a specific interface
pve-network-interface-pinning generate --interface enp1s0 --target-name if42
```

**Manual method** (`/etc/systemd/network/10-enwan0.link`):

```ini
[Match]
MACAddress=aa:bb:cc:dd:ee:ff
Type=ether

[Link]
Name=enwan0
```

After creating link files:

```bash
update-initramfs -u -k all
# Then reboot
```

## Network Setups

### Default Bridged Configuration

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual

auto vmbr0
iface vmbr0 inet static
    address 192.168.10.2/24
    gateway 192.168.10.1
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
```

VMs behave as if they were directly connected to the physical network.

### Routed Configuration

For hosting providers that block multiple MAC addresses per port:

```bash
auto lo
iface lo inet loopback

auto eno0
iface eno0 inet static
    address 198.51.100.5/29
    gateway 198.51.100.1
    post-up echo 1 > /proc/sys/net/ipv4/ip_forward
    post-up echo 1 > /proc/sys/net/ipv4/conf/eno0/proxy_arp

auto vmbr0
iface vmbr0 inet static
    address 203.0.113.17/28
    bridge-ports none
    bridge-stp off
    bridge-fd 0
```

### Masquerading (NAT)

For VMs with private IPs:

```bash
auto lo
iface lo inet loopback

auto eno1
iface eno1 inet static
    address 198.51.100.5/24
    gateway 198.51.100.1

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.1/24
    bridge-ports none
    bridge-stp off
    bridge-fd 0
    post-up echo 1 > /proc/sys/net/ipv4/ip_forward
    post-up iptables -t nat -A POSTROUTING -s '10.10.10.0/24' -o eno1 -j MASQUERADE
    post-down iptables -t nat -D POSTROUTING -s '10.10.10.0/24' -o eno1 -j MASQUERADE
```

**Conntrack zones fix** (if the firewall blocks outgoing traffic):

```bash
post-up iptables -t raw -I PREROUTING -i fwbr+ -j CT --zone 1
post-down iptables -t raw -D PREROUTING -i fwbr+ -j CT --zone 1
```

## Linux Bonding

### Bond Modes

1. **balance-rr** - Round-robin (load balancing + fault tolerance)
2. **active-backup** - Only one active NIC (fault tolerance only)
3. **balance-xor** - XOR selection (load balancing + fault tolerance)
4. **broadcast** - Transmit on all slaves (fault tolerance)
5. **802.3ad (LACP)** - IEEE 802.3ad dynamic link aggregation (requires switch support)
6. **balance-tlb** - Adaptive transmit load balancing
7. **balance-alb** - Adaptive load balancing (balance-tlb + receive balancing)

**Recommendation:**

- If the switch supports LACP → use 802.3ad
- Otherwise → use active-backup
### Bond with Fixed IP

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet static
    bond-slaves eno1 eno2
    address 192.168.1.2/24
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports eno3
    bridge-stp off
    bridge-fd 0
```

### Bond as Bridge Port

For a fault-tolerant guest network:

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet manual
    bond-slaves eno1 eno2
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports bond0
    bridge-stp off
    bridge-fd 0
```

## VLAN Configuration (802.1Q)

### VLAN Awareness on Bridge

**Guest VLANs** - Configure the VLAN tag in the VM settings; the bridge handles tagging transparently.

**Bridge with VLAN awareness:**

```bash
auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```

### Host Management on VLAN

**With VLAN-aware bridge:**

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual

auto vmbr0.5
iface vmbr0.5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1

auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```

**Traditional VLAN:**

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno1.5 inet manual

auto vmbr0v5
iface vmbr0v5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports eno1.5
    bridge-stp off
    bridge-fd 0

auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
```

### VLAN with Bonding

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet manual
    bond-slaves eno1 eno2
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

iface bond0.5 inet manual

auto vmbr0v5
iface vmbr0v5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports bond0.5
    bridge-stp off
    bridge-fd 0

auto vmbr0
iface vmbr0 inet manual
    bridge-ports bond0
    bridge-stp off
    bridge-fd 0
```

## Advanced Features

### Disable MAC Learning

Available since Proxmox VE 7.3:

```bash
auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports ens18
    bridge-stp off
    bridge-fd 0
    bridge-disable-mac-learning 1
```

With MAC learning disabled, Proxmox VE adds the VM/CT MAC addresses to the forwarding database itself.

### Disable IPv6

Create `/etc/sysctl.d/disable-ipv6.conf`:

```ini
net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1
```

Then apply it: `sysctl -p /etc/sysctl.d/disable-ipv6.conf`
## Troubleshooting

### Avoid ifup/ifdown

**Don't use** `ifup`/`ifdown` on bridges: they interrupt guest traffic without reconnecting it.

**Use instead:**

- The GUI "Apply Configuration" button
- The `ifreload -a` command
- A reboot

### Network Changes Not Applied

1. Check that `/etc/network/interfaces.new` exists
2. Click "Apply Configuration" in the GUI or run `ifreload -a`
3. If issues persist, reboot

### Bond Not Working with Corosync

Some bond modes are problematic for Corosync. Use multiple independent networks (Corosync links) instead of bonding for cluster traffic.
467
skills/proxmox-infrastructure/reference/qemu-guest-agent.md
Normal file
@@ -0,0 +1,467 @@

# QEMU Guest Agent Integration

## Overview

The QEMU Guest Agent (`qemu-guest-agent`) is a service running inside VMs that enables communication between Proxmox and the guest OS. It provides IP address detection, graceful shutdowns, filesystem freezing for snapshots, and more.

## Why Use QEMU Guest Agent?

**Without Guest Agent:**

- VM IP address unknown to Proxmox
- Shutdown = hard power off
- Snapshots don't freeze the filesystem (risk of inconsistent data)
- No guest-level monitoring

**With Guest Agent:**

- Automatic IP address detection
- Graceful shutdown/reboot
- Consistent snapshots with filesystem freeze
- Execute commands inside the VM
- Query guest information (hostname, users, OS details)
## Installation in Guest VM

### Ubuntu/Debian

```bash
sudo apt update
sudo apt install qemu-guest-agent
sudo systemctl enable qemu-guest-agent
sudo systemctl start qemu-guest-agent
```

### RHEL/Rocky/AlmaLinux

```bash
sudo dnf install qemu-guest-agent
sudo systemctl enable qemu-guest-agent
sudo systemctl start qemu-guest-agent
```

### Verify Installation

```bash
systemctl status qemu-guest-agent
```

**Expected output:**

```text
● qemu-guest-agent.service - QEMU Guest Agent
     Loaded: loaded (/lib/systemd/system/qemu-guest-agent.service; enabled)
     Active: active (running)
```

## Enable in VM Configuration

### Via Proxmox Web UI

**VM → Hardware → Add → QEMU Guest Agent**

OR edit the VM options:

**VM → Options → QEMU Guest Agent → Edit → Check "Use QEMU Guest Agent"**

### Via CLI

```bash
qm set <vmid> --agent 1
```

**With custom options:**

```bash
# Enable, and run fstrim on disks after cloning
qm set <vmid> --agent enabled=1,fstrim_cloned_disks=1
```

### Via Terraform

```hcl
resource "proxmox_vm_qemu" "vm" {
  name = "my-vm"
  # ... other config ...

  agent = 1 # Enable guest agent
}
```

### Via Ansible

```yaml
- name: Enable QEMU guest agent
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    agent: 1
    update: true
```

## Using Guest Agent

### Check Agent Status

**Via CLI:**

```bash
# Test if the agent is responding
qm agent 101 ping

# Get guest info
qm agent 101 info

# Get network interfaces (including IP addresses)
qm agent 101 network-get-interfaces

# Get OS details
qm agent 101 get-osinfo
```

**Example output:**

```json
{
  "result": {
    "id": "ubuntu",
    "kernel-release": "5.15.0-91-generic",
    "kernel-version": "#101-Ubuntu SMP",
    "machine": "x86_64",
    "name": "Ubuntu",
    "pretty-name": "Ubuntu 22.04.3 LTS",
    "version": "22.04",
    "version-id": "22.04"
  }
}
```

### Execute Commands

**Via CLI:**

```bash
# Execute command in guest
qm guest exec 101 -- whoami

# With arguments
qm guest exec 101 -- ls -la /tmp
```

**Via Python API:**

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Execute command
result = proxmox.nodes('foxtrot').qemu(101).agent.exec.post(
    command=['whoami']
)

# Get execution result
pid = result['pid']
exec_status = proxmox.nodes('foxtrot').qemu(101).agent('exec-status').get(pid=pid)
print(exec_status)
```

### Graceful Shutdown/Reboot

**Shutdown (graceful with agent):**

```bash
# Sends an ACPI shutdown to the guest and waits for the OS to shut down
qm shutdown 101

# Force stop if shutdown doesn't complete within 60s
qm shutdown 101 --timeout 60 --forceStop 1
```

**Reboot:**

```bash
qm reboot 101
```
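The equivalent over the API uses the `status/shutdown` endpoint, which takes the same timeout/force semantics; a minimal sketch:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Graceful shutdown; hard-stop if the guest hasn't halted after 60s
proxmox.nodes('foxtrot').qemu(101).status.shutdown.post(timeout=60, forceStop=1)
```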

## Snapshot Integration

### Filesystem Freeze for Consistent Snapshots

When the guest agent is enabled, Proxmox can freeze the filesystem before taking a snapshot, ensuring consistency.

**Create snapshot with FS freeze:**

```bash
# The guest agent automatically freezes the filesystem
qm snapshot 101 before-upgrade --vmstate 0 --description "Before upgrade"
```

**Rollback to snapshot:**

```bash
qm rollback 101 before-upgrade
```

**Delete snapshot:**

```bash
qm delsnapshot 101 before-upgrade
```
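The snapshot lifecycle maps directly onto the API (`snapshot`, `snapshot/{name}/rollback`, and DELETE on the snapshot path); a sketch mirroring the CLI commands above:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
vm = proxmox.nodes('foxtrot').qemu(101)

# Create a snapshot (the agent freezes the filesystem when enabled)
vm.snapshot.post(snapname='before-upgrade', vmstate=0,
                 description='Before upgrade')

# Roll back, then delete the snapshot once it is no longer needed
vm.snapshot('before-upgrade').rollback.post()
vm.snapshot('before-upgrade').delete()
```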

## IP Address Detection

### Automatic IP Assignment

With the guest agent, Proxmox automatically detects VM IP addresses.

**View in Web UI:**

VM → Summary → the IPs section shows detected addresses

**Via CLI:**

```bash
qm agent 101 network-get-interfaces | jq '.result[] | select(.name=="eth0") | ."ip-addresses"'
```

**Via Python:**

```python
interfaces = proxmox.nodes('foxtrot').qemu(101).agent('network-get-interfaces').get()

for iface in interfaces['result']:
    if iface['name'] == 'eth0':
        for ip in iface.get('ip-addresses', []):
            if ip['ip-address-type'] == 'ipv4':
                print(f"IPv4: {ip['ip-address']}")
```

## Advanced Configuration

### Guest Agent Options

**Full options syntax:**

```bash
qm set <vmid> --agent [enabled=]<1|0>[,fstrim_cloned_disks=<1|0>][,type=<virtio|isa>]
```

**Parameters:**

- `enabled` - Enable/disable the guest agent (default: 0)
- `fstrim_cloned_disks` - Run fstrim after cloning a disk (default: 0)
- `type` - Agent communication type: virtio or isa (default: virtio)

**Example:**

```bash
# Enable with fstrim on cloned disks
qm set 101 --agent enabled=1,fstrim_cloned_disks=1
```

### Filesystem Trim (fstrim)

For VMs on thin-provisioned storage (LVM-thin, CEPH), fstrim helps reclaim unused space.

**Manual fstrim:**

```bash
# Inside VM
sudo fstrim -av
```

**Automatic on clone:**

```bash
qm set <vmid> --agent enabled=1,fstrim_cloned_disks=1
```

**Scheduled fstrim (inside VM):**

```bash
# Enable the weekly fstrim timer
sudo systemctl enable fstrim.timer
sudo systemctl start fstrim.timer
```

## Cloud-Init Integration

### Include in Cloud-Init Template

**During template creation:**

```bash
# Install the agent package into the image
virt-customize -a ubuntu-22.04.img \
    --install qemu-guest-agent \
    --run-command "systemctl enable qemu-guest-agent"

# Create VM from image
qm create 9000 --name ubuntu-template --memory 2048 --cores 2 --net0 virtio,bridge=vmbr0
qm importdisk 9000 ubuntu-22.04.img local-lvm
qm set 9000 --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-9000-disk-0
qm set 9000 --agent 1 # Enable guest agent
qm set 9000 --ide2 local-lvm:cloudinit
qm template 9000
```

### Cloud-Init User Data

**Include in the cloud-init config:**

```yaml
#cloud-config
packages:
  - qemu-guest-agent

runcmd:
  - systemctl enable qemu-guest-agent
  - systemctl start qemu-guest-agent
```

## Troubleshooting

### Guest Agent Not Responding

**1. Check if the service is running in the guest:**

```bash
# Inside VM
systemctl status qemu-guest-agent
journalctl -u qemu-guest-agent
```

**2. Check if the agent is enabled in the VM config:**

```bash
# On Proxmox host
qm config 101 | grep agent
```

**3. Check the virtio serial device:**

```bash
# Inside VM
ls -l /dev/virtio-ports/
# Should show: org.qemu.guest_agent.0
```

**4. Restart the agent:**

```bash
# Inside VM
sudo systemctl restart qemu-guest-agent
```

**5. Check that Proxmox can communicate:**

```bash
# On Proxmox host
qm agent 101 ping
```
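For scripted checks, the same ping is available via the API and raises on failure; a sketch assuming `ResourceException` from `proxmoxer.core`:

```python
from proxmoxer import ProxmoxAPI
from proxmoxer.core import ResourceException

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

try:
    proxmox.nodes('foxtrot').qemu(101).agent.ping.post()
    print("Guest agent is responding")
except ResourceException as exc:
    # PVE returns an error when the agent is disabled or not running
    print(f"Guest agent not responding: {exc}")
```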
### IP Address Not Detected

**Possible causes:**

1. Guest agent not running
2. Network interface not configured
3. DHCP not assigning an IP
4. Firewall blocking communication

**Debug:**

```bash
# On the Proxmox host: check all interfaces reported by the agent
qm agent 101 network-get-interfaces | jq

# Inside the VM: verify cloud-init completed
cloud-init status
```
### Filesystem Freeze Timeout

**Symptoms:**

Snapshot or backup creation hangs or times out during the filesystem freeze.

**Solution:**

Look inside the guest for whatever is blocking the freeze (busy databases, hung mounts):

```bash
# Inside VM
journalctl -u qemu-guest-agent
```

On recent Proxmox VE releases the freeze can also be skipped during backups:

```bash
qm set 101 --agent enabled=1,freeze-fs-on-backup=0

# Snapshots without saved RAM state are still possible while debugging
qm snapshot 101 test --vmstate 0
```

### Agent Installed but Not Enabled

**Check the VM config:**

```bash
qm config 101 | grep agent
```

**If missing, enable it:**

```bash
qm set 101 --agent 1
```

**Restart the VM for the change to take effect:**

```bash
qm reboot 101
```

## Best Practices

1. **Always install in templates** - Include qemu-guest-agent in VM templates
2. **Enable during provisioning** - Set `--agent 1` when creating VMs
3. **Use for production VMs** - Critical for graceful shutdowns and monitoring
4. **Enable fstrim for thin storage** - Helps reclaim space on LVM-thin and CEPH
5. **Test before snapshots** - Verify the agent works: `qm agent <vmid> ping`
6. **Cloud-init integration** - Automate installation via cloud-init packages
7. **Monitor agent status** - Check that the agent is running in your monitoring tools
## Ansible Automation Example

```yaml
---
- name: Ensure QEMU guest agent is configured
  hosts: proxmox_vms
  become: true
  tasks:
    - name: Install qemu-guest-agent
      ansible.builtin.apt:
        name: qemu-guest-agent
        state: present
      when: ansible_os_family == "Debian"

    - name: Enable and start qemu-guest-agent
      ansible.builtin.systemd:
        name: qemu-guest-agent
        enabled: true
        state: started

    - name: Verify agent is running
      ansible.builtin.systemd:
        name: qemu-guest-agent
      register: agent_status

    - name: Report agent status
      ansible.builtin.debug:
        msg: "Guest agent is {{ agent_status.status.ActiveState }}"
```

## Further Reading

- [Proxmox QEMU Guest Agent Documentation](https://pve.proxmox.com/wiki/Qemu-guest-agent)
- [QEMU Guest Agent Protocol](https://www.qemu.org/docs/master/interop/qemu-ga.html)
486
skills/proxmox-infrastructure/reference/storage-management.md
Normal file
@@ -0,0 +1,486 @@

# Proxmox Storage Management

## Overview

Proxmox VE supports multiple storage backends. This guide focuses on the storage architecture of the Matrix cluster: LVM-thin for boot disks and CEPH for distributed storage.

## Matrix Cluster Storage Architecture

### Hardware Configuration

**Per Node (Foxtrot, Golf, Hotel):**

```text
nvme0n1 - 1TB Crucial P3       → Boot disk + LVM
nvme1n1 - 4TB Samsung 990 PRO  → CEPH OSD (2 OSDs)
nvme2n1 - 4TB Samsung 990 PRO  → CEPH OSD (2 OSDs)
```

**Total Cluster:**

- 3× 1TB boot disks (LVM local storage)
- 6× 4TB NVMe drives (24TB raw CEPH capacity)
- 12 CEPH OSDs total (2 per NVMe drive)

### Storage Pools

```text
Storage Pool   Type      Backend     Purpose
------------   -------   ---------   -------
local          dir       Directory   ISO images, templates, backups
local-lvm      lvmthin   LVM-thin    VM disks (local)
ceph-pool      rbd       CEPH RBD    VM disks (distributed, HA)
ceph-fs        cephfs    CephFS      Shared filesystem
```

## LVM Storage

### LVM-thin Configuration

**Advantages:**

- Thin provisioning (overcommit storage)
- Fast snapshots
- Local to each node (low latency)
- No network overhead

**Disadvantages:**

- No HA (tied to a single node)
- Live migration requires copying disks to the target node
- Limited to the node's local disk size

**Check LVM usage:**

```bash
# View volume groups
vgs

# View logical volumes
lvs

# View thin pool usage
lvs -a | grep thin
```

**Example output:**

```text
LV            VG  Attr       LSize   Pool Origin Data%
data          pve twi-aotz-- 850.00g             45.23
vm-101-disk-0 pve Vwi-aotz--  50.00g data        12.45
```

### Managing LVM Storage

**Extend the thin pool (if the boot disk has free space):**

```bash
# Check free space in the VG
vgs pve

# Extend the thin pool
lvextend -L +100G pve/data
```

**Create a VM disk manually:**

```bash
# Create a 50GB thin volume for VM 101
lvcreate -V 50G -T pve/data -n vm-101-disk-0
```

## CEPH Storage

### CEPH Architecture for Matrix

**Network Configuration:**

```text
vmbr1 (192.168.5.0/24, MTU 9000) → CEPH Public Network
vmbr2 (192.168.7.0/24, MTU 9000) → CEPH Private Network
```

**OSD Distribution:**

```text
Node      NVMe      OSDs   Capacity
-------   -------   ----   --------
foxtrot   nvme1n1   2      4TB
foxtrot   nvme2n1   2      4TB
golf      nvme1n1   2      4TB
golf      nvme2n1   2      4TB
hotel     nvme1n1   2      4TB
hotel     nvme2n1   2      4TB
-------   -------   ----   --------
Total               12     24TB raw
```

**Usable capacity (replica 3):** ~8TB. With 3 replicas every byte is stored three times, so 24TB raw / 3 = 8TB usable (less in practice, since roughly 20% should stay free for rebalancing).
### CEPH Deployment Commands

**Install CEPH:**

```bash
# On the first node (foxtrot)
pveceph install --version reef

# Initialize the cluster
pveceph init --network 192.168.5.0/24 --cluster-network 192.168.7.0/24
```

**Create Monitors (3 for quorum):**

```bash
# On each node
pveceph mon create
```

**Create Manager (on each node):**

```bash
pveceph mgr create
```

**Create OSDs:**

```bash
# On each node - one command per NVMe drive

# For nvme1n1 (4TB)
pveceph osd create /dev/nvme1n1 --crush-device-class nvme

# For nvme2n1 (4TB)
pveceph osd create /dev/nvme2n1 --crush-device-class nvme
```

Note: `pveceph osd create` provisions one OSD per device; splitting a drive into the two OSDs described above is typically done with `ceph-volume lvm batch --osds-per-device 2` instead.

**Create CEPH Pool:**

```bash
# Create an RBD pool for VMs
pveceph pool create ceph-pool --add_storages

# Create CephFS for shared storage
pveceph fs create --name cephfs --add-storage
```

### CEPH Configuration Best Practices

**Optimize for NVMe** (`/etc/pve/ceph.conf`):

```ini
[global]
public_network = 192.168.5.0/24
cluster_network = 192.168.7.0/24
osd_pool_default_size = 3
osd_pool_default_min_size = 2

[osd]
osd_memory_target = 4294967296  # 4GB per OSD
osd_max_backfills = 1
osd_recovery_max_active = 1
```

**Restart the CEPH OSD services after a config change:**

```bash
systemctl restart ceph-osd.target
```

### CEPH Monitoring

**Check cluster health:**

```bash
ceph status
ceph health detail
```

**Example healthy output:**

```text
cluster:
  id:     a1b2c3d4-e5f6-7890-abcd-ef1234567890
  health: HEALTH_OK

services:
  mon: 3 daemons, quorum foxtrot,golf,hotel
  mgr: foxtrot(active), standbys: golf, hotel
  osd: 12 osds: 12 up, 12 in

data:
  pools:   2 pools, 128 pgs
  objects: 1.23k objects, 45 GiB
  usage:   135 GiB used, 23.8 TiB / 24 TiB avail
  pgs:     128 active+clean
```

**Check OSD performance:**

```bash
ceph osd df
ceph osd perf
```

**Check pool usage:**

```bash
ceph df
rados df
```
## Storage Configuration in Proxmox

### Add Storage via Web UI

**Datacenter → Storage → Add:**

1. **Directory** - For ISOs and backups
2. **LVM-Thin** - For local VM disks
3. **RBD** - For CEPH VM disks
4. **CephFS** - For shared files

### Add Storage via CLI

**CEPH RBD:**

```bash
pvesm add rbd ceph-pool \
    --pool ceph-pool \
    --content images,rootdir \
    --nodes foxtrot,golf,hotel
```

**CephFS:**

```bash
pvesm add cephfs cephfs \
    --path /mnt/pve/cephfs \
    --content backup,iso,vztmpl \
    --nodes foxtrot,golf,hotel
```

**NFS (if using an external NAS):**

```bash
pvesm add nfs nas-storage \
    --server 192.168.3.10 \
    --export /mnt/tank/proxmox \
    --content images,backup,iso \
    --nodes foxtrot,golf,hotel
```

## VM Disk Management

### Create VM Disk on CEPH

**Via CLI:**

```bash
# Create a 100GB disk for VM 101 on CEPH
qm set 101 --scsi1 ceph-pool:100
```

**Via API (Python):**

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
proxmox.nodes('foxtrot').qemu(101).config.put(scsi1='ceph-pool:100')
```

### Move VM Disk Between Storage

**Move from local-lvm to CEPH:**

```bash
qm move-disk 101 scsi0 ceph-pool --delete 1
```

The same command works while the VM is running: `qm move-disk` mirrors the disk to the target storage online, so no separate flag is needed.
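The move is also exposed as the `move_disk` API call; a sketch with the same parameters as the CLI command (`disk`, target `storage`, `delete`):

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Move scsi0 of VM 101 to ceph-pool, deleting the source volume afterwards
proxmox.nodes('foxtrot').qemu(101).move_disk.post(
    disk='scsi0',
    storage='ceph-pool',
    delete=1,
)
```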

### Resize VM Disk

**Grow a disk (shrinking is not supported):**

```bash
# Grow VM 101's scsi0 by 50GB
qm resize 101 scsi0 +50G
```

**Inside the VM (expand the filesystem):**

```bash
# For ext4
sudo resize2fs /dev/sda1

# For XFS
sudo xfs_growfs /
```
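Via the API, growing a disk is a `PUT` on the `resize` endpoint with the same `+size` syntax:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Grow VM 101's scsi0 by 50GB (shrinking is not supported)
proxmox.nodes('foxtrot').qemu(101).resize.put(disk='scsi0', size='+50G')
```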

## Backup and Restore

### Backup to Storage

**Create a backup:**

```bash
# Backup VM 101 to local storage
vzdump 101 --storage local --mode snapshot --compress zstd

# Backup to CephFS
vzdump 101 --storage cephfs --mode snapshot --compress zstd
```
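Backups can also be triggered programmatically through the node's `vzdump` endpoint, which takes the same options as the CLI; a minimal sketch:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Snapshot-mode backup of VM 101 to CephFS; returns a task UPID
upid = proxmox.nodes('foxtrot').vzdump.post(
    vmid=101,
    storage='cephfs',
    mode='snapshot',
    compress='zstd',
)
print(f"Backup task started: {upid}")
```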

**Scheduled backups (via Web UI):**

Datacenter → Backup → Add:

- Schedule: Daily at 2 AM
- Storage: cephfs
- Mode: Snapshot
- Compression: ZSTD
- Retention: Keep last 7
### Restore from Backup

**List backups:**

```bash
ls /var/lib/vz/dump/
# OR
ls /mnt/pve/cephfs/dump/
```

**Restore:**

```bash
# Restore to the same VMID
qmrestore /var/lib/vz/dump/vzdump-qemu-101-2024_01_15-02_00_00.vma.zst 101

# Restore to a new VMID
qmrestore /var/lib/vz/dump/vzdump-qemu-101-2024_01_15-02_00_00.vma.zst 102 --storage ceph-pool
```

## Performance Tuning

### CEPH Performance

**Set the proper device class on NVMe OSDs:**

```bash
ceph osd crush set-device-class nvme osd.0
ceph osd crush set-device-class nvme osd.1
# ... repeat for all OSDs
```

**Create a performance pool:**

```bash
ceph osd pool create fast-pool 128 128
ceph osd pool application enable fast-pool rbd
```

**Enable RBD cache** (`/etc/pve/ceph.conf`):

```ini
[client]
rbd_cache = true
rbd_cache_size = 134217728  # 128MB
rbd_cache_writethrough_until_flush = false
```

### LVM Performance

**Use SSD emulation and discard:**

```bash
# Enable discard on the VM disk
qm set 101 --scsi0 local-lvm:vm-101-disk-0,discard=on,ssd=1
```
## Troubleshooting

### CEPH Not Healthy

**Check OSD status:**

```bash
ceph osd tree
ceph osd stat
```

**Restart a stuck OSD:**

```bash
systemctl restart ceph-osd@0.service
```

**Check network connectivity:**

```bash
# From one node to another; 8972 = 9000 minus 28 bytes of ICMP/IP headers
ping -c 3 -M do -s 8972 192.168.5.6 # Test MTU 9000
```

### LVM Out of Space

**Check thin pool usage:**

```bash
lvs pve/data -o lv_name,data_percent,metadata_percent
```

**If the thin pool is > 90% full:**

```bash
# Extend if the VG has space
lvextend -L +100G pve/data

# OR delete unused VM disks
lvremove pve/vm-XXX-disk-0
```

### Storage Performance Issues

**Test disk I/O:**

```bash
# Test sequential write
dd if=/dev/zero of=/tmp/test bs=1M count=1024 oflag=direct

# Test CEPH RBD performance
rbd bench --io-type write ceph-pool/test-image
```

**Monitor CEPH latency:**

```bash
ceph osd perf
```

## Best Practices

1. **Use CEPH for HA VMs** - Store critical VM disks on CEPH for live migration
2. **Use LVM for performance** - Non-critical VMs get better performance on local LVM
3. **MTU 9000 for CEPH** - Always use jumbo frames on CEPH networks
4. **Separate networks** - Keep the public and private CEPH networks on different interfaces
5. **Monitor CEPH health** - Set up alerts for HEALTH_WARN/HEALTH_ERR
6. **Regular backups** - Automated daily backups to CephFS or an external NAS
7. **Plan for growth** - Leave 20% free space in CEPH for rebalancing
8. **Use replica 3** - Essential for data safety, especially with only 3 nodes

## Further Reading

- [Proxmox VE Storage Documentation](https://pve.proxmox.com/wiki/Storage)
- [CEPH Documentation](https://docs.ceph.com/)
- [Proxmox CEPH Guide](https://pve.proxmox.com/wiki/Deploy_Hyper-Converged_Ceph_Cluster)
469
skills/proxmox-infrastructure/tools/check_ceph_health.py
Executable file
@@ -0,0 +1,469 @@

#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
CEPH Cluster Health Checker

Validates CEPH storage cluster health including:
- Cluster health status
- Monitor and manager status
- OSD status and distribution
- Pool configuration and usage
- PG state verification

Usage:
    python check_ceph_health.py [--node NODE] [--json]

Examples:
    # Check CEPH health (requires SSH access to cluster node)
    python check_ceph_health.py --node foxtrot

    # Output as JSON for parsing
    python check_ceph_health.py --node foxtrot --json

    # Check minimum OSD count
    python check_ceph_health.py --node foxtrot --min-osds 12
"""

import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional


@dataclass
class OSDStatus:
    """OSD status information"""
    osd_id: int
    host: str
    status: str  # up/down
    in_cluster: bool
    weight: float
    device_class: str


@dataclass
class PoolStatus:
    """Pool status information"""
    name: str
    pool_id: int
    size: int
    min_size: int
    pg_num: int
    pgp_num: int
    used_bytes: int
    max_avail_bytes: int
    percent_used: float


@dataclass
class MonitorStatus:
    """Monitor status"""
    name: str
    rank: int
    address: str
    in_quorum: bool


@dataclass
class ManagerStatus:
    """Manager status"""
    name: str
    active: bool
    address: str


@dataclass
class CEPHHealth:
    """Overall CEPH health"""
    status: str  # HEALTH_OK, HEALTH_WARN, HEALTH_ERR
    num_osds: int
    num_up_osds: int
    num_in_osds: int
    num_pgs: int
    num_active_clean_pgs: int
    monitors: List[MonitorStatus] = field(default_factory=list)
    managers: List[ManagerStatus] = field(default_factory=list)
    osds: List[OSDStatus] = field(default_factory=list)
    pools: List[PoolStatus] = field(default_factory=list)
    data_bytes: int = 0
    used_bytes: int = 0
    avail_bytes: int = 0
    total_bytes: int = 0
    warnings: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    @property
    def is_healthy(self) -> bool:
        """Check if CEPH is in a healthy state"""
        return (
            self.status == 'HEALTH_OK' and
            self.num_up_osds == self.num_osds and
            self.num_in_osds == self.num_osds and
            self.num_active_clean_pgs == self.num_pgs and
            len(self.errors) == 0
        )

    @property
    def percent_used(self) -> float:
        """Cluster usage as a percentage of raw capacity"""
        # Divide by total raw capacity, not logical data stored
        if self.total_bytes == 0:
            return 0.0
        return (self.used_bytes / self.total_bytes) * 100


class CEPHHealthChecker:
    """Check CEPH cluster health via SSH"""

    def __init__(self, node: str):
        # Validate node is a valid hostname or IP address
        if not self._validate_node(node):
            raise ValueError(f"Invalid node name or IP address: {node}")
        self.node = node
        self.health = CEPHHealth(
            status="UNKNOWN",
            num_osds=0,
            num_up_osds=0,
            num_in_osds=0,
            num_pgs=0,
            num_active_clean_pgs=0
        )

    def _validate_node(self, node: str) -> bool:
        """Validate node is a valid hostname or IP address"""
        # Allow valid hostnames and IPv4/IPv6 addresses
        hostname_pattern = r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
        ipv4_pattern = r'^(\d{1,3}\.){3}\d{1,3}$'
        ipv6_pattern = r'^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$'
        return bool(
            re.match(hostname_pattern, node) or
            re.match(ipv4_pattern, node) or
            re.match(ipv6_pattern, node)
        )

    def run_command(self, command: str) -> str:
        """Execute command on remote node via SSH"""
        try:
            # Use -- to prevent SSH option injection
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", f"root@{self.node}", "--", command],
                capture_output=True,
                text=True,
                check=True,
                timeout=30
            )
            return result.stdout
        except subprocess.TimeoutExpired as e:
            error_msg = f"Command timed out after 30s: {command}"
            self.health.errors.append(error_msg)
            raise RuntimeError(error_msg) from e
        except subprocess.CalledProcessError as e:
            error_msg = f"Command failed: {command}: {e.stderr}"
            self.health.errors.append(error_msg)
            raise RuntimeError(error_msg) from e

    def check_ceph_status(self):
        """Check ceph status output"""
        output = self.run_command("ceph status --format json")
        if not output:
            self.health.errors.append("Failed to get CEPH status")
            return

        try:
            status_data = json.loads(output)

            # Parse overall health
            self.health.status = status_data.get('health', {}).get('status', 'UNKNOWN')

            # Parse OSD map (older releases nest it under osdmap.osdmap)
            osd_map = status_data.get('osdmap', {})
            if 'osdmap' in osd_map:
                osd_map = osd_map['osdmap']
            self.health.num_osds = osd_map.get('num_osds', 0)
            self.health.num_up_osds = osd_map.get('num_up_osds', 0)
            self.health.num_in_osds = osd_map.get('num_in_osds', 0)

            # Parse PG map
            pg_map = status_data.get('pgmap', {})
            self.health.num_pgs = pg_map.get('num_pgs', 0)

            # Parse PG states
            pg_states = pg_map.get('pgs_by_state', [])
            for state in pg_states:
                if state.get('state_name') == 'active+clean':
                    self.health.num_active_clean_pgs = state.get('count', 0)

            # Parse storage usage
            self.health.data_bytes = pg_map.get('data_bytes', 0)
            self.health.used_bytes = pg_map.get('bytes_used', 0)
            self.health.avail_bytes = pg_map.get('bytes_avail', 0)
            self.health.total_bytes = pg_map.get('bytes_total', 0)

            # Check for health warnings
            health_checks = status_data.get('health', {}).get('checks', {})
            for check_name, check_data in health_checks.items():
                severity = check_data.get('severity', '')
                summary = check_data.get('summary', {}).get('message', '')

                if severity == 'HEALTH_ERR':
                    self.health.errors.append(f"{check_name}: {summary}")
                elif severity == 'HEALTH_WARN':
                    self.health.warnings.append(f"{check_name}: {summary}")

        except (json.JSONDecodeError, KeyError) as e:
            self.health.errors.append(f"Failed to parse CEPH status: {e}")

    def check_monitors(self):
        """Check monitor status"""
        output = self.run_command("ceph mon dump --format json")
        if not output:
            self.health.warnings.append("Failed to get monitor status")
            return

        try:
            mon_data = json.loads(output)
            quorum = set()

            # Get quorum members
            quorum_output = self.run_command("ceph quorum_status --format json")
            if quorum_output:
                quorum_data = json.loads(quorum_output)
                quorum = set(quorum_data.get('quorum', []))

            # Parse monitors
            for mon in mon_data.get('mons', []):
                self.health.monitors.append(MonitorStatus(
                    name=mon.get('name', ''),
                    rank=mon.get('rank', -1),
                    address=mon.get('addr', ''),
                    in_quorum=mon.get('rank', -1) in quorum
                ))

            # Check if all monitors are in quorum
            not_in_quorum = [m.name for m in self.health.monitors if not m.in_quorum]
            if not_in_quorum:
                self.health.warnings.append(
                    f"Monitors not in quorum: {', '.join(not_in_quorum)}"
                )

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse monitor status: {e}")

    def check_managers(self):
        """Check manager status"""
        output = self.run_command("ceph mgr dump --format json")
        if not output:
            self.health.warnings.append("Failed to get manager status")
            return

        try:
            mgr_data = json.loads(output)

            # Active manager
            active_name = mgr_data.get('active_name', '')
            active_addr = mgr_data.get('active_addr', '')
            if active_name:
                self.health.managers.append(ManagerStatus(
                    name=active_name,
                    active=True,
                    address=active_addr
                ))

            # Standby managers (mgr dump only exposes a gid for standbys)
            for standby in mgr_data.get('standbys', []):
                self.health.managers.append(ManagerStatus(
                    name=standby.get('name', ''),
                    active=False,
                    address=str(standby.get('gid', ''))
                ))

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse manager status: {e}")

    def check_osds(self):
        """Check OSD status"""
        output = self.run_command("ceph osd tree --format json")
        if not output:
            self.health.warnings.append("Failed to get OSD tree")
            return

        try:
            osd_data = json.loads(output)

            # Map OSD ids to their host: host entries in the tree list
            # their OSDs under 'children'
            host_of = {}
            for node in osd_data.get('nodes', []):
                if node.get('type') == 'host':
                    for child in node.get('children', []):
                        host_of[child] = node.get('name', 'unknown')

            # Parse OSD nodes
            for node in osd_data.get('nodes', []):
                if node.get('type') == 'osd':
                    osd_id = node.get('id', -1)
                    status = node.get('status', 'unknown')
                    in_cluster = node.get('exists', 0) == 1

                    self.health.osds.append(OSDStatus(
                        osd_id=osd_id,
                        host=host_of.get(osd_id, 'unknown'),
                        status=status,
                        in_cluster=in_cluster,
                        weight=node.get('crush_weight', 0.0),
                        device_class=node.get('device_class', 'unknown')
                    ))

            # Check for down OSDs
            down_osds = [o.osd_id for o in self.health.osds if o.status != 'up']
            if down_osds:
                self.health.errors.append(f"OSDs down: {down_osds}")

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse OSD tree: {e}")

    def check_pools(self):
        """Check pool status"""
        output = self.run_command("ceph osd pool ls detail --format json")
        if not output:
            self.health.warnings.append("Failed to get pool information")
            return

        try:
            pool_data = json.loads(output)

            # Usage figures come from 'ceph df'; 'ceph osd pool stats'
            # only reports IO and recovery rates
            df_stats = {}
            df_output = self.run_command("ceph df --format json")
            if df_output:
                df_data = json.loads(df_output)
                df_stats = {p.get('name', ''): p.get('stats', {})
                            for p in df_data.get('pools', [])}

            for pool in pool_data:
                pool_name = pool.get('pool_name', '')
                stats = df_stats.get(pool_name, {})

                self.health.pools.append(PoolStatus(
                    name=pool_name,
                    pool_id=pool.get('pool', 0),
                    size=pool.get('size', 0),
                    min_size=pool.get('min_size', 0),
                    pg_num=pool.get('pg_num', 0),
                    pgp_num=pool.get('pgp_num', 0),
                    used_bytes=stats.get('bytes_used', 0),
                    max_avail_bytes=stats.get('max_avail', 0),
                    percent_used=stats.get('percent_used', 0.0) * 100
                ))

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse pool information: {e}")

    def check_pg_state(self):
        """Verify all PGs are active+clean"""
        if self.health.num_active_clean_pgs != self.health.num_pgs:
            self.health.errors.append(
                f"Not all PGs active+clean: {self.health.num_active_clean_pgs}/{self.health.num_pgs}"
            )

    def run_all_checks(self) -> CEPHHealth:
        """Run all health checks"""
        self.check_ceph_status()
        self.check_monitors()
        self.check_managers()
        self.check_osds()
        self.check_pools()
        self.check_pg_state()

        return self.health


def human_readable_size(bytes_val: int) -> str:
    """Convert bytes to a human-readable string"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if bytes_val < 1024.0:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024.0
    return f"{bytes_val:.2f} EB"


def main():
    parser = argparse.ArgumentParser(
        description="Check CEPH cluster health",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--node',
        default='foxtrot',
        help='Cluster node to check (default: foxtrot)'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output as JSON'
    )
    parser.add_argument(
        '--min-osds',
        type=int,
        help='Minimum expected OSD count (error if below this)'
    )

    args = parser.parse_args()

    # Run health checks
    checker = CEPHHealthChecker(args.node)
    health = checker.run_all_checks()

    # Check minimum OSD count
    if args.min_osds and health.num_osds < args.min_osds:
        health.errors.append(
            f"OSD count below minimum: {health.num_osds} < {args.min_osds}"
        )

    if args.json:
        # Output as JSON
        print(json.dumps(asdict(health), indent=2))
        # Exit with appropriate code based on health status
        sys.exit(0 if health.is_healthy else 1)
    else:
        # Human-readable output
        print("CEPH Cluster Health Check")
        print("=" * 60)
        print(f"Overall Status: {health.status}")
        print(f"OSDs: {health.num_up_osds}/{health.num_osds} up, {health.num_in_osds}/{health.num_osds} in")
        print(f"PGs: {health.num_active_clean_pgs}/{health.num_pgs} active+clean")
        print(f"Usage: {health.percent_used:.1f}% ({human_readable_size(health.used_bytes)}/{human_readable_size(health.total_bytes)})")

        print("\nMonitors:")
        for mon in health.monitors:
            quorum_status = "✓" if mon.in_quorum else "✗"
            print(f"  {quorum_status} {mon.name} (rank: {mon.rank}, {mon.address})")

        print("\nManagers:")
        for mgr in health.managers:
            active_status = "ACTIVE" if mgr.active else "STANDBY"
            print(f"  {mgr.name} ({active_status}, {mgr.address})")

        print("\nOSDs:")
        for osd in health.osds:
            status = "✓" if osd.status == 'up' else "✗"
            in_status = "in" if osd.in_cluster else "out"
            print(f"  {status} osd.{osd.osd_id} on {osd.host} ({in_status}, {osd.device_class})")

        print("\nPools:")
        for pool in health.pools:
            print(f"  {pool.name}: size={pool.size}, min_size={pool.min_size}, "
                  f"pgs={pool.pg_num}, used={pool.percent_used:.1f}%")

        if health.warnings:
            print("\nWarnings:")
            for warning in health.warnings:
                print(f"  ⚠ {warning}")

        if health.errors:
            print("\nErrors:")
            for error in health.errors:
                print(f"  ✗ {error}")

        print("\n" + "=" * 60)
        if health.is_healthy:
            print("Status: ✓ HEALTHY")
            sys.exit(0)
        else:
            print("Status: ✗ UNHEALTHY")
            sys.exit(1)


if __name__ == '__main__':
    main()
339
skills/proxmox-infrastructure/tools/check_cluster_health.py
Executable file
@@ -0,0 +1,339 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Proxmox Cluster Health Checker

Validates Proxmox cluster health including:
- Cluster quorum status
- Node membership and status
- Corosync ring health
- Resource manager status
- Configuration version sync

Usage:
    python check_cluster_health.py [--node NODE] [--json]

Examples:
    # Check cluster health (requires SSH access to cluster node)
    python check_cluster_health.py --node foxtrot

    # Output as JSON for parsing
    python check_cluster_health.py --node foxtrot --json
"""

import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class NodeStatus:
    """Cluster node status"""
    name: str
    online: bool
    node_id: int
    ip: str


@dataclass
class CorosyncStatus:
    """Corosync ring status"""
    ring_id: int
    nodes: List[str]
    status: str


@dataclass
class ClusterHealth:
    """Overall cluster health"""
    cluster_name: str
    quorate: bool
    node_count: int
    expected_votes: int
    total_votes: int
    nodes: List[NodeStatus]
    corosync_rings: List[CorosyncStatus]
    config_version: Optional[int]
    warnings: List[str]
    errors: List[str]

    @property
    def is_healthy(self) -> bool:
        """Check if cluster is in a healthy state"""
        return self.quorate and len(self.errors) == 0


class ClusterHealthChecker:
    """Check Proxmox cluster health via SSH"""

    def __init__(self, node: str):
        # Validate node is a valid hostname or IP address
        if not self._validate_node(node):
            raise ValueError(f"Invalid node name or IP address: {node}")
        self.node = node
        self.health = ClusterHealth(
            cluster_name="",
            quorate=False,
            node_count=0,
            expected_votes=0,
            total_votes=0,
            nodes=[],
            corosync_rings=[],
            config_version=None,
            warnings=[],
            errors=[]
        )

    def _validate_node(self, node: str) -> bool:
        """Validate node is a valid hostname or IP address"""
        # Allow valid hostnames and IPv4/IPv6 addresses
        hostname_pattern = r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
        ipv4_pattern = r'^(\d{1,3}\.){3}\d{1,3}$'
        ipv6_pattern = r'^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$'
        return bool(
            re.match(hostname_pattern, node) or
            re.match(ipv4_pattern, node) or
            re.match(ipv6_pattern, node)
        )

    def run_command(self, command: str) -> str:
        """Execute command on remote node via SSH"""
        try:
            # Use -- to prevent SSH option injection
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", f"root@{self.node}", "--", command],
                capture_output=True,
                text=True,
                check=True,
                timeout=30
            )
            return result.stdout
        except subprocess.TimeoutExpired:
            self.health.errors.append(f"Command timed out: {command}")
            return ""
        except subprocess.CalledProcessError as e:
            self.health.errors.append(f"Command failed: {command}: {e.stderr}")
            return ""

    def check_cluster_status(self):
        """Check pvecm status output"""
        output = self.run_command("pvecm status")
        if not output:
            self.health.errors.append("Failed to get cluster status")
            return

        # Parse cluster name
        cluster_match = re.search(r'Cluster name:\s+(\S+)', output)
        if cluster_match:
            self.health.cluster_name = cluster_match.group(1)

        # Parse quorum status
        quorum_match = re.search(r'Quorate:\s+(\w+)', output)
        if quorum_match:
            self.health.quorate = quorum_match.group(1).lower() == 'yes'

        if not self.health.quorate:
            self.health.errors.append("Cluster does not have quorum!")

        # Parse node count
        node_match = re.search(r'Nodes:\s+(\d+)', output)
        if node_match:
            self.health.node_count = int(node_match.group(1))

        # Parse expected votes
        expected_match = re.search(r'Expected votes:\s+(\d+)', output)
        if expected_match:
            self.health.expected_votes = int(expected_match.group(1))

        # Parse total votes
        total_match = re.search(r'Total votes:\s+(\d+)', output)
        if total_match:
            self.health.total_votes = int(total_match.group(1))

        # Check if we have majority
        if self.health.total_votes < (self.health.expected_votes // 2 + 1):
            self.health.errors.append(
                f"Insufficient votes: {self.health.total_votes}/{self.health.expected_votes}"
            )

    def check_nodes(self):
        """Check node membership"""
        output = self.run_command("pvecm nodes")
        if not output:
            self.health.warnings.append("Failed to get node list")
            return

        # Parse node list (skip header)
        lines = output.strip().split('\n')[1:]  # Skip header
        for line in lines:
            if not line.strip():
                continue

            # Example: " 1 0x00000001 foxtrot 192.168.3.5"
            parts = line.split()
            if len(parts) >= 3:
                try:
                    node_id = int(parts[0])
                    name = parts[2] if len(parts) >= 3 else "unknown"
                    ip = parts[3] if len(parts) >= 4 else "unknown"
                    online = True  # If in list, assumed online

                    self.health.nodes.append(NodeStatus(
                        name=name,
                        online=online,
                        node_id=node_id,
                        ip=ip
                    ))
                except (ValueError, IndexError) as e:
                    self.health.warnings.append(f"Failed to parse node line: {line}: {e}")

        # Verify expected node count
        if len(self.health.nodes) != self.health.node_count:
            self.health.warnings.append(
                f"Node count mismatch: expected {self.health.node_count}, found {len(self.health.nodes)}"
            )

    def check_corosync(self):
        """Check corosync ring status"""
        output = self.run_command("corosync-cfgtool -s")
        if not output:
            self.health.warnings.append("Failed to get corosync status")
            return

        # Parse corosync status
        # Example output:
        #   Printing ring status.
        #   Local node ID 1
        #   RING ID 0
        #       id      = 192.168.8.5
        #       status  = ring 0 active with no faults

        current_ring = None
        for line in output.split('\n'):
            line = line.strip()

            if line.startswith('RING ID'):
                ring_match = re.search(r'RING ID (\d+)', line)
                if ring_match:
                    current_ring = int(ring_match.group(1))

            elif 'status' in line.lower() and current_ring is not None:
                status_match = re.search(r'status\s*=\s*(.+)', line)
                if status_match:
                    status = status_match.group(1)

                    # Check for faults
                    if 'no faults' not in status.lower():
                        self.health.errors.append(f"Corosync ring {current_ring}: {status}")

                    self.health.corosync_rings.append(CorosyncStatus(
                        ring_id=current_ring,
                        nodes=[],  # Could parse this if needed
                        status=status
                    ))

    def check_config_version(self):
        """Check cluster configuration version"""
        output = self.run_command("corosync-cmapctl -b totem.config_version")
        if output:
            try:
                self.health.config_version = int(output.strip())
            except ValueError:
                self.health.warnings.append("Failed to parse config version")

    def check_resource_manager(self):
        """Check pve-cluster service status"""
        output = self.run_command("systemctl is-active pve-cluster")
        if output.strip() != "active":
            self.health.errors.append("pve-cluster service is not active")

        # Check pmxcfs filesystem
        output = self.run_command("pvecm status | grep -i 'cluster filesystem'")
        if output and 'online' not in output.lower():
            self.health.warnings.append("Cluster filesystem may not be online")

    def run_all_checks(self) -> ClusterHealth:
        """Run all health checks"""
        self.check_cluster_status()
        self.check_nodes()
        self.check_corosync()
        self.check_config_version()
        self.check_resource_manager()

        return self.health


def main():
    parser = argparse.ArgumentParser(
        description="Check Proxmox cluster health",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--node',
        default='foxtrot',
        help='Cluster node to check (default: foxtrot)'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output as JSON'
    )

    args = parser.parse_args()

    # Run health checks
    checker = ClusterHealthChecker(args.node)
    health = checker.run_all_checks()

    if args.json:
        # Output as JSON
        print(json.dumps(asdict(health), indent=2))
        # Mirror the human-readable path's exit code so automation can branch on it
        sys.exit(0 if health.is_healthy else 1)
    else:
        # Human-readable output
        print(f"Cluster Health Check: {health.cluster_name}")
        print("=" * 60)
        print(f"Quorum Status: {'✓ YES' if health.quorate else '✗ NO'}")
        print(f"Nodes: {health.node_count} ({health.total_votes}/{health.expected_votes} votes)")

        if health.config_version:
            print(f"Config Version: {health.config_version}")

        print("\nNodes:")
        for node in health.nodes:
            status = "✓" if node.online else "✗"
            print(f"  {status} {node.name} (ID: {node.node_id}, IP: {node.ip})")

        print("\nCorosync Rings:")
        for ring in health.corosync_rings:
            print(f"  Ring {ring.ring_id}: {ring.status}")

        if health.warnings:
            print("\nWarnings:")
            for warning in health.warnings:
                print(f"  ⚠ {warning}")

        if health.errors:
            print("\nErrors:")
            for error in health.errors:
                print(f"  ✗ {error}")

        print("\n" + "=" * 60)
        if health.is_healthy:
            print("Status: ✓ HEALTHY")
            sys.exit(0)
        else:
            print("Status: ✗ UNHEALTHY")
            sys.exit(1)


if __name__ == '__main__':
    main()
252
skills/proxmox-infrastructure/tools/cluster_status.py
Executable file
@@ -0,0 +1,252 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
"""
Display Proxmox cluster health and resource usage.

Usage:
    ./cluster_status.py
    ./cluster_status.py --node foxtrot
    ./cluster_status.py --detailed

Environment Variables:
    PROXMOX_VE_ENDPOINT - Proxmox API endpoint (e.g., https://192.168.3.5:8006)
    PROXMOX_VE_USERNAME - Username (e.g., root@pam)
    PROXMOX_VE_PASSWORD - Password
    OR
    PROXMOX_VE_API_TOKEN - API token (user@realm!token-id=secret)
"""

import argparse
import os
import sys

from proxmoxer import ProxmoxAPI, ResourceException


class ClusterMonitor:
    """Monitor Proxmox cluster health and resources."""

    def __init__(self, endpoint: str, auth_type: str, **auth_kwargs):
        """Initialize Proxmox connection."""
        self.endpoint = endpoint.replace("https://", "").replace(":8006", "")

        try:
            if auth_type == "token":
                user, token = auth_kwargs["token"].split("!")
                token_name, token_value = token.split("=")
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=user,
                    token_name=token_name,
                    token_value=token_value,
                    verify_ssl=False
                )
            else:
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=auth_kwargs["user"],
                    password=auth_kwargs["password"],
                    verify_ssl=False
                )
        except Exception as e:
            print(f"❌ Failed to connect to Proxmox: {e}", file=sys.stderr)
            sys.exit(1)

    def get_cluster_status(self):
        """Get cluster status and quorum info."""
        try:
            status = self.proxmox.cluster.status.get()
            return status
        except ResourceException as e:
            print(f"❌ Failed to get cluster status: {e}", file=sys.stderr)
            return None

    def get_node_status(self, node_name: str):
        """Get detailed node status."""
        try:
            status = self.proxmox.nodes(node_name).status.get()
            return status
        except ResourceException as e:
            print(f"❌ Failed to get node status: {e}", file=sys.stderr)
            return None

    def get_node_vms(self, node_name: str):
        """Get VMs on a node."""
        try:
            vms = self.proxmox.nodes(node_name).qemu.get()
            return vms
        except ResourceException as e:
            print(f"❌ Failed to get VMs: {e}", file=sys.stderr)
            return []

    def display_cluster_overview(self):
        """Display cluster overview."""
        print("🖥️  Proxmox Cluster Status")
        print("=" * 70)

        cluster_status = self.get_cluster_status()
        if not cluster_status:
            return

        # Find cluster info
        cluster_info = next((item for item in cluster_status if item['type'] == 'cluster'), None)
        if cluster_info:
            print(f"\n📊 Cluster: {cluster_info.get('name', 'N/A')}")
            print(f"   Quorum: {cluster_info.get('quorate', 0)} (nodes: {cluster_info.get('nodes', 0)})")

        # Node statuses
        nodes = [item for item in cluster_status if item['type'] == 'node']

        print(f"\n🔧 Nodes ({len(nodes)}):")
        print(f"{'Node':<15} {'Status':<10} {'CPU':<12} {'Memory':<20} {'VMs':<8}")
        print("-" * 70)

        for node_info in nodes:
            node_name = node_info['name']
            online = "✓ Online" if node_info.get('online', 0) == 1 else "✗ Offline"

            # Get detailed status
            detailed = self.get_node_status(node_name)
            if not detailed:
                print(f"{node_name:<15} {online:<10} {'N/A':<12} {'N/A':<20} {'N/A':<8}")
                continue

            # CPU usage
            cpu_pct = detailed.get('cpu', 0) * 100
            cpu_str = f"{cpu_pct:.1f}%"

            # Memory usage
            mem_used = detailed.get('memory', {}).get('used', 0) / (1024**3)  # GB
            mem_total = detailed.get('memory', {}).get('total', 0) / (1024**3)  # GB
            mem_pct = (mem_used / mem_total * 100) if mem_total > 0 else 0
            mem_str = f"{mem_used:.1f}/{mem_total:.1f}GB ({mem_pct:.1f}%)"

            # VM count
            vms = self.get_node_vms(node_name)
            vm_count = len(vms)
            running_vms = len([vm for vm in vms if vm.get('status') == 'running'])
            vm_str = f"{running_vms}/{vm_count}"

            print(f"{node_name:<15} {online:<10} {cpu_str:<12} {mem_str:<20} {vm_str:<8}")

        print("=" * 70)

    def display_node_detail(self, node_name: str):
        """Display detailed node information."""
        print(f"\n🔍 Node Details: {node_name}")
        print("=" * 70)

        status = self.get_node_status(node_name)
        if not status:
            return

        # System info
        print("\n💻 System:")
        print(f"   Uptime: {status.get('uptime', 0) / 86400:.1f} days")
        # The API returns loadavg entries as strings, so coerce before formatting
        loadavg = status.get('loadavg', [0, 0, 0])
        print(f"   Load Average: {float(loadavg[0]):.2f}")
        print(f"   CPU Cores: {status.get('cpuinfo', {}).get('cpus', 'N/A')}")

        # CPU
        cpu_pct = status.get('cpu', 0) * 100
        print(f"\n🖥️  CPU Usage: {cpu_pct:.1f}%")

        # Memory
        mem = status.get('memory', {})
        mem_used = mem.get('used', 0) / (1024**3)
        mem_total = mem.get('total', 0) / (1024**3)
        mem_free = mem.get('free', 0) / (1024**3)
        mem_pct = (mem_used / mem_total * 100) if mem_total > 0 else 0

        print("\n💾 Memory:")
        print(f"   Used: {mem_used:.2f} GB ({mem_pct:.1f}%)")
        print(f"   Free: {mem_free:.2f} GB")
        print(f"   Total: {mem_total:.2f} GB")

        # Storage
        root = status.get('rootfs', {})
        root_used = root.get('used', 0) / (1024**3)
        root_total = root.get('total', 0) / (1024**3)
        root_avail = root.get('avail', 0) / (1024**3)
        root_pct = (root_used / root_total * 100) if root_total > 0 else 0

        print("\n💿 Root Filesystem:")
        print(f"   Used: {root_used:.2f} GB ({root_pct:.1f}%)")
        print(f"   Available: {root_avail:.2f} GB")
        print(f"   Total: {root_total:.2f} GB")

        # VMs
        vms = self.get_node_vms(node_name)
        print(f"\n🖼️  Virtual Machines ({len(vms)}):")

        if vms:
            print(f"   {'VMID':<8} {'Name':<25} {'Status':<10} {'CPU':<8} {'Memory':<15}")
            print("   " + "-" * 66)

            for vm in vms:
                vmid = vm.get('vmid', 'N/A')
                name = vm.get('name', 'N/A')[:24]
                status = vm.get('status', 'unknown')
                cpu_pct = vm.get('cpu', 0) * 100 if vm.get('status') == 'running' else 0
                mem = vm.get('mem', 0) / (1024**2) if vm.get('status') == 'running' else 0  # MB

                status_icon = "▶️" if status == "running" else "⏸️"
                print(f"   {vmid:<8} {name:<25} {status_icon} {status:<8} {cpu_pct:>6.1f}% {mem:>8.0f} MB")
        else:
            print("   No VMs found")

        print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Display Proxmox cluster health and resource usage"
    )
    parser.add_argument(
        "--node",
        help="Show detailed info for specific node"
    )
    parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed info for all nodes"
    )

    args = parser.parse_args()

    # Get authentication from environment
    endpoint = os.getenv("PROXMOX_VE_ENDPOINT")
    api_token = os.getenv("PROXMOX_VE_API_TOKEN")
    username = os.getenv("PROXMOX_VE_USERNAME")
    password = os.getenv("PROXMOX_VE_PASSWORD")

    if not endpoint:
        print("❌ PROXMOX_VE_ENDPOINT environment variable required", file=sys.stderr)
        sys.exit(1)

    # Determine authentication method
    if api_token:
        monitor = ClusterMonitor(endpoint, "token", token=api_token)
    elif username and password:
        monitor = ClusterMonitor(endpoint, "password", user=username, password=password)
    else:
        print("❌ Authentication required: set PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD", file=sys.stderr)
        sys.exit(1)

    # Display status
    if args.node:
        monitor.display_node_detail(args.node)
    elif args.detailed:
        monitor.display_cluster_overview()
        # Get all nodes and show details
        cluster_status = monitor.get_cluster_status()
        if cluster_status:
            nodes = [item['name'] for item in cluster_status if item['type'] == 'node']
            for node_name in nodes:
                monitor.display_node_detail(node_name)
    else:
        monitor.display_cluster_overview()


if __name__ == "__main__":
    main()
224
skills/proxmox-infrastructure/tools/validate_template.py
Executable file
@@ -0,0 +1,224 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
"""
Validate Proxmox VM template health and configuration.

Usage:
    ./validate_template.py --template-id 9000 --node foxtrot
    ./validate_template.py --template-id 9000 --all-nodes

Environment Variables:
    PROXMOX_VE_ENDPOINT - Proxmox API endpoint (e.g., https://192.168.3.5:8006)
    PROXMOX_VE_USERNAME - Username (e.g., root@pam)
    PROXMOX_VE_PASSWORD - Password
    OR
    PROXMOX_VE_API_TOKEN - API token (user@realm!token-id=secret)
"""

import argparse
import os
import sys
from typing import Optional

from proxmoxer import ProxmoxAPI, ResourceException


class TemplateValidator:
    """Validates Proxmox VM templates."""

    def __init__(self, endpoint: str, auth_type: str, **auth_kwargs):
        """Initialize Proxmox connection."""
        self.endpoint = endpoint.replace("https://", "").replace(":8006", "")

        try:
            if auth_type == "token":
                user, token = auth_kwargs["token"].split("!")
                token_name, token_value = token.split("=")
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=user,
                    token_name=token_name,
                    token_value=token_value,
                    verify_ssl=False
                )
            else:
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=auth_kwargs["user"],
                    password=auth_kwargs["password"],
                    verify_ssl=False
                )
        except Exception as e:
            print(f"❌ Failed to connect to Proxmox: {e}", file=sys.stderr)
            sys.exit(1)

    def find_template(self, template_id: int, node: Optional[str] = None):
        """Find template on cluster."""
        nodes = [node] if node else [n['node'] for n in self.proxmox.nodes.get()]

        for node_name in nodes:
            try:
                vms = self.proxmox.nodes(node_name).qemu.get()
                for vm in vms:
                    if vm['vmid'] == template_id:
                        return node_name, vm
            except ResourceException:
                continue

        return None, None

    def validate_template(self, template_id: int, node: Optional[str] = None):
        """Validate template configuration."""
        print(f"🔍 Validating template {template_id}...")

        # Find template
        node_name, vm_info = self.find_template(template_id, node)

        if not node_name:
            print(f"❌ Template {template_id} not found", file=sys.stderr)
            return False

        print(f"✓ Found on node: {node_name}")

        # Check if it's actually a template
        if vm_info.get('template', 0) != 1:
            print(f"❌ VM {template_id} is not a template", file=sys.stderr)
            return False

        print("✓ Confirmed as template")

        # Get detailed config
        try:
            config = self.proxmox.nodes(node_name).qemu(template_id).config.get()
        except ResourceException as e:
            print(f"❌ Failed to get template config: {e}", file=sys.stderr)
            return False

        # Validation checks
        checks = {
            "Cloud-init drive": self._check_cloudinit(config),
            "QEMU guest agent": self._check_agent(config),
            "SCSI controller": self._check_scsi(config),
            "Boot disk": self._check_boot_disk(config),
            "Serial console": self._check_serial(config),
            "EFI disk": self._check_efi(config),
        }

        # Print results
        print("\n📋 Validation Results:")
        print("-" * 50)

        all_passed = True
        for check_name, (passed, message) in checks.items():
            status = "✓" if passed else "✗"
            print(f"{status} {check_name}: {message}")
            if not passed:
                all_passed = False

        print("-" * 50)

        # Print template info
        print("\n📊 Template Info:")
        print(f"   Name: {config.get('name', 'N/A')}")
        print(f"   Memory: {config.get('memory', 'N/A')} MB")
        print(f"   Cores: {config.get('cores', 'N/A')}")
        print(f"   Sockets: {config.get('sockets', 'N/A')}")

        if all_passed:
            print(f"\n✅ Template {template_id} is properly configured")
        else:
            print(f"\n⚠️  Template {template_id} has configuration issues")

        return all_passed

    def _check_cloudinit(self, config):
        """Check for cloud-init drive."""
        for key in config:
            if key.startswith('ide') and 'cloudinit' in str(config[key]):
                return True, f"Found at {key}"
        return False, "Missing cloud-init drive (should be ide2)"

    def _check_agent(self, config):
        """Check for QEMU guest agent."""
        agent = config.get('agent', '0')
        if agent in ['1', 'enabled=1']:
            return True, "Enabled"
        return False, "Not enabled (recommended for IP detection)"

    def _check_scsi(self, config):
        """Check SCSI controller type."""
        scsihw = config.get('scsihw', '')
        if 'virtio' in scsihw:
            return True, f"Using {scsihw}"
        return False, f"Not using virtio-scsi (found: {scsihw or 'none'})"

    def _check_boot_disk(self, config):
        """Check for boot disk."""
        for key in config:
            if key.startswith('scsi') and key != 'scsihw':
                return True, f"Found at {key}"
        return False, "No SCSI disk found"

    def _check_serial(self, config):
        """Check for serial console."""
        if 'serial0' in config:
            return True, "Configured"
        return False, "Not configured (recommended for cloud images)"

    def _check_efi(self, config):
        """Check for EFI disk."""
        if 'efidisk0' in config:
            return True, "Configured"
        return False, "Not configured (needed for UEFI boot)"


def main():
    parser = argparse.ArgumentParser(
        description="Validate Proxmox VM template health and configuration"
    )
    parser.add_argument(
        "--template-id",
        type=int,
        required=True,
        help="Template VM ID (e.g., 9000)"
    )
    parser.add_argument(
        "--node",
        help="Specific Proxmox node to check (default: search all nodes)"
    )
    parser.add_argument(
        "--all-nodes",
        action="store_true",
        help="Search all nodes in cluster"
    )

    args = parser.parse_args()

    # Get authentication from environment
    endpoint = os.getenv("PROXMOX_VE_ENDPOINT")
    api_token = os.getenv("PROXMOX_VE_API_TOKEN")
    username = os.getenv("PROXMOX_VE_USERNAME")
    password = os.getenv("PROXMOX_VE_PASSWORD")

    if not endpoint:
        print("❌ PROXMOX_VE_ENDPOINT environment variable required", file=sys.stderr)
        sys.exit(1)

    # Determine authentication method
    if api_token:
        validator = TemplateValidator(endpoint, "token", token=api_token)
    elif username and password:
        validator = TemplateValidator(endpoint, "password", user=username, password=password)
    else:
        print("❌ Authentication required: set PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD", file=sys.stderr)
        sys.exit(1)

    # Validate template
    node = None if args.all_nodes else args.node
    success = validator.validate_template(args.template_id, node)

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
782
skills/proxmox-infrastructure/workflows/ceph-deployment.md
Normal file
@@ -0,0 +1,782 @@
# CEPH Storage Deployment Workflow

Complete guide to deploying CEPH storage on a Proxmox VE cluster with automated OSD creation, pool
configuration, and health verification.

## Overview

This workflow automates CEPH deployment with:

- CEPH package installation
- Cluster initialization with proper network configuration
- Monitor and manager creation across all nodes
- Automated OSD creation with partition support
- Pool configuration with replication and compression
- Comprehensive health verification

## Prerequisites

Before deploying CEPH:

1. **Cluster must be formed:**
   - Proxmox cluster already initialized and healthy
   - All nodes showing quorum
   - See [Cluster Formation](cluster-formation.md) first

2. **Network requirements:**
   - Dedicated CEPH public network (192.168.5.0/24 for Matrix)
   - Dedicated CEPH private/cluster network (192.168.7.0/24 for Matrix)
   - MTU 9000 (jumbo frames) configured on CEPH networks
   - Bridges configured: vmbr1 (public), vmbr2 (private)

3. **Storage requirements:**
   - Dedicated disks for OSDs (not boot disks)
   - All OSD disks should be the same type (SSD/NVMe)
   - Matrix: 2× 4TB Samsung 990 PRO NVMe per node = 24TB raw

4. **System requirements:**
   - Minimum 3 nodes for production (replication factor 3)
   - At least 4GB RAM per OSD
   - Fast network (10GbE recommended for CEPH networks)

## Phase 1: Install CEPH Packages

### Step 1: Install CEPH

```yaml
# roles/proxmox_ceph/tasks/install.yml
---
- name: Check if CEPH is already installed
  ansible.builtin.stat:
    path: /etc/pve/ceph.conf
  register: ceph_conf_check

- name: Check CEPH packages
  ansible.builtin.command:
    cmd: dpkg -l ceph-common
  register: ceph_package_check
  failed_when: false
  changed_when: false

- name: Install CEPH packages via pveceph
  ansible.builtin.command:
    cmd: "pveceph install --repository {{ ceph_repository }}"
  when: ceph_package_check.rc != 0
  register: ceph_install
  changed_when: "'installed' in ceph_install.stdout | default('')"

- name: Verify CEPH installation
  ansible.builtin.command:
    cmd: ceph --version
  register: ceph_version
  changed_when: false
  failed_when: ceph_version.rc != 0

- name: Display CEPH version
  ansible.builtin.debug:
    msg: "Installed CEPH version: {{ ceph_version.stdout }}"
```

## Phase 2: Initialize CEPH Cluster

### Step 2: Initialize CEPH (First Node Only)

```yaml
# roles/proxmox_ceph/tasks/init.yml
---
- name: Check if CEPH cluster is initialized
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_status_check
  failed_when: false
  changed_when: false

- name: Set CEPH initialization facts
  ansible.builtin.set_fact:
    ceph_initialized: "{{ ceph_status_check.rc == 0 }}"
    is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group | default('matrix_cluster')][0] }}"

- name: Initialize CEPH cluster on first node
  ansible.builtin.command:
    cmd: >
      pveceph init
      --network {{ ceph_network }}
      --cluster-network {{ ceph_cluster_network }}
  when:
    - is_ceph_first_node
    - not ceph_initialized
  register: ceph_init
  changed_when: ceph_init.rc == 0

- name: Wait for CEPH cluster to initialize
  ansible.builtin.pause:
    seconds: 15
  when: ceph_init.changed

- name: Verify CEPH initialization
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_init_verify
  changed_when: false
  when:
    - is_ceph_first_node
  failed_when:
    - ceph_init_verify.rc != 0

- name: Display initial CEPH status
  ansible.builtin.debug:
    var: ceph_init_verify.stdout_lines
  when:
    - is_ceph_first_node
    - ceph_init.changed or ansible_verbosity > 0
```
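
`pveceph init` writes the shared configuration to `/etc/pve/ceph.conf`, which pmxcfs replicates across the cluster, including the `public_network` and `cluster_network` values passed above. If the networks ever need to change, edit that file rather than re-running init.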

## Phase 3: Create Monitors and Managers

### Step 3: Create CEPH Monitors

```yaml
# roles/proxmox_ceph/tasks/monitors.yml
---
- name: Check existing CEPH monitors
  ansible.builtin.command:
    cmd: ceph mon dump --format json
  register: mon_dump
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  failed_when: false
  changed_when: false

- name: Parse monitor list
  ansible.builtin.set_fact:
    existing_monitors: "{{ (mon_dump.stdout | from_json).mons | map(attribute='name') | list }}"
  when: mon_dump.rc == 0

- name: Set monitor facts
  ansible.builtin.set_fact:
    has_monitor: "{{ inventory_hostname_short in existing_monitors | default([]) }}"

- name: Create CEPH monitor on first node
  ansible.builtin.command:
    cmd: pveceph mon create
  when:
    - is_ceph_first_node
    - not has_monitor
  register: mon_create_first
  changed_when: mon_create_first.rc == 0

- name: Wait for first monitor to stabilize
  ansible.builtin.pause:
    seconds: 10
  when: mon_create_first.changed

- name: Create CEPH monitors on other nodes
  ansible.builtin.command:
    cmd: pveceph mon create
  when:
    - not is_ceph_first_node
    - not has_monitor
  register: mon_create_others
  changed_when: mon_create_others.rc == 0

- name: Verify monitor quorum
  ansible.builtin.command:
    cmd: ceph quorum_status --format json
  register: quorum_status
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Check monitor quorum size
  ansible.builtin.assert:
    that:
      - (quorum_status.stdout | from_json).quorum | length >= ((groups[cluster_group | default('matrix_cluster')] | length // 2) + 1)
    fail_msg: "Monitor quorum not established"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
```
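
CEPH monitors form quorum by strict majority, so keep the monitor count odd; with the three Matrix nodes that is one monitor per node, and adding a fourth would raise the quorum requirement without buying any extra failure tolerance.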

### Step 4: Create CEPH Managers

```yaml
# roles/proxmox_ceph/tasks/managers.yml
---
- name: Check existing CEPH managers
  ansible.builtin.command:
    cmd: ceph mgr dump --format json
  register: mgr_dump
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  failed_when: false
  changed_when: false

- name: Parse manager list
  ansible.builtin.set_fact:
    existing_managers: "{{ [(mgr_dump.stdout | from_json).active_name] + ((mgr_dump.stdout | from_json).standbys | map(attribute='name') | list) }}"
  when: mgr_dump.rc == 0

- name: Initialize empty manager list if check failed
  ansible.builtin.set_fact:
    existing_managers: []
  when: mgr_dump.rc != 0

- name: Set manager facts
  ansible.builtin.set_fact:
    has_manager: "{{ inventory_hostname_short in (existing_managers | default([])) }}"

- name: Create CEPH manager
  ansible.builtin.command:
    cmd: pveceph mgr create
  when: not has_manager
  register: mgr_create
  changed_when: mgr_create.rc == 0

- name: Wait for managers to stabilize
  ansible.builtin.pause:
    seconds: 5
  when: mgr_create.changed

- name: Enable CEPH dashboard module
  ansible.builtin.command:
    cmd: ceph mgr module enable dashboard
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  register: dashboard_enable
  changed_when: "'already enabled' not in dashboard_enable.stderr"
  failed_when:
    - dashboard_enable.rc != 0
    - "'already enabled' not in dashboard_enable.stderr"

- name: Enable Prometheus module
  ansible.builtin.command:
    cmd: ceph mgr module enable prometheus
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  register: prometheus_enable
  changed_when: "'already enabled' not in prometheus_enable.stderr"
  failed_when:
    - prometheus_enable.rc != 0
    - "'already enabled' not in prometheus_enable.stderr"
```

## Phase 4: Create OSDs

### Step 5: Prepare and Create OSDs

```yaml
# roles/proxmox_ceph/tasks/osd_create.yml
---
- name: Get list of existing OSDs
  ansible.builtin.command:
    cmd: ceph osd ls
  register: existing_osds
  changed_when: false
  failed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Check OSD devices availability
  ansible.builtin.command:
    cmd: "lsblk -ndo NAME,SIZE,TYPE {{ item.device }}"
  register: device_check
  failed_when: device_check.rc != 0
  changed_when: false
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"

- name: Display device information
  ansible.builtin.debug:
    msg: "Device {{ item.item.device }}: {{ item.stdout }}"
  loop: "{{ device_check.results }}"
  loop_control:
    label: "{{ item.item.device }}"
  when: ansible_verbosity > 0

- name: Wipe existing partitions on OSD devices
  ansible.builtin.command:
    cmd: "wipefs -a {{ item.device }}"
  when:
    - ceph_wipe_disks | default(false)
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"
  register: wipe_result
  changed_when: wipe_result.rc == 0

- name: Create OSDs from whole devices (no partitioning)
  ansible.builtin.command:
    cmd: >
      pveceph osd create {{ item.device }}
      {% if item.db_device is defined and item.db_device %}--db_dev {{ item.db_device }}{% endif %}
      {% if item.wal_device is defined and item.wal_device %}--wal_dev {{ item.wal_device }}{% endif %}
  when:
    - item.partitions | default(1) == 1
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"
  register: osd_create_whole
  changed_when: "'successfully created' in osd_create_whole.stdout | default('')"
  failed_when:
    - osd_create_whole.rc != 0
    - "'already in use' not in osd_create_whole.stderr | default('')"
    - "'ceph-volume' not in osd_create_whole.stderr | default('')"

- name: Create multiple OSDs per device (with partitioning)
  ansible.builtin.command:
    cmd: >
      pveceph osd create {{ item.0.device }}
      --size {{ (item.0.device_size_gb | default(4000) / item.0.partitions) | int }}G
      {% if item.0.db_device is defined and item.0.db_device %}--db_dev {{ item.0.db_device }}{% endif %}
      {% if item.0.wal_device is defined and item.0.wal_device %}--wal_dev {{ item.0.wal_device }}{% endif %}
  when:
    - item.0.partitions | default(1) > 1
  with_subelements:
    - "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
    - partition_indices
    - skip_missing: true
  loop_control:
    label: "{{ item.0.device }} partition {{ item.1 }}"
  register: osd_create_partition
  changed_when: "'successfully created' in osd_create_partition.stdout | default('')"
  failed_when:
    - osd_create_partition.rc != 0
    - "'already in use' not in osd_create_partition.stderr | default('')"

- name: Wait for OSDs to come up
  ansible.builtin.command:
    cmd: ceph osd tree --format json
  register: osd_tree
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  until: >
    (osd_tree.stdout | from_json).nodes
    | selectattr('type', 'equalto', 'osd')
    | selectattr('status', 'equalto', 'up')
    | list | length >= expected_osd_count | int
  retries: 20
  delay: 10
  vars:
    # One OSD per configured partition; one per device when partitions is unset
    expected_osd_count: >-
      {{
        ceph_osds.values()
        | list
        | flatten
        | map(attribute='partitions', default=1)
        | sum
      }}
```

## Phase 5: Create and Configure Pools

### Step 6: Create CEPH Pools

```yaml
# roles/proxmox_ceph/tasks/pools.yml
---
- name: Get existing CEPH pools
  ansible.builtin.command:
    cmd: ceph osd pool ls
  register: existing_pools
  changed_when: false

- name: Create CEPH pools
  ansible.builtin.command:
    cmd: >
      ceph osd pool create {{ item.name }}
      {{ item.pg_num }}
      {{ item.pgp_num | default(item.pg_num) }}
  when: item.name not in existing_pools.stdout_lines
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_create
  changed_when: pool_create.rc == 0

- name: Set pool replication size
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} size {{ item.size }}"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_size
  changed_when: "'set pool' in pool_size.stdout"

- name: Set pool minimum replication size
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} min_size {{ item.min_size }}"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_min_size
  changed_when: "'set pool' in pool_min_size.stdout"

- name: Set pool application
  ansible.builtin.command:
    cmd: "ceph osd pool application enable {{ item.name }} {{ item.application }}"
  when: item.application is defined
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_app
  changed_when: "'enabled application' in pool_app.stdout"
  failed_when:
    - pool_app.rc != 0
    - "'already enabled' not in pool_app.stderr"

- name: Enable compression on pools
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} compression_mode aggressive"
  when: item.compression | default(false)
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_compression
  changed_when: "'set pool' in pool_compression.stdout"

- name: Set compression algorithm
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} compression_algorithm {{ item.compression_algorithm | default('zstd') }}"
  when: item.compression | default(false)
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_compression_algo
  changed_when: "'set pool' in pool_compression_algo.stdout"
```
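
Optionally, PG sizing can be handed to CEPH's built-in autoscaler instead of being fixed at creation time. The task below is a sketch, not part of the role above; it assumes the `pg_autoscaler` manager module, which recent CEPH releases ship enabled by default:

```yaml
# Optional follow-up to pools.yml: let CEPH grow or shrink pg_num per pool
- name: Enable PG autoscaler on pools
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} pg_autoscale_mode on"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_autoscale
  changed_when: "'set pool' in pool_autoscale.stdout"
```

With the autoscaler on, the explicit `pg_num`/`pgp_num` values become starting hints rather than fixed commitments.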

## Phase 6: Verify CEPH Health

### Step 7: Health Verification

```yaml
# roles/proxmox_ceph/tasks/verify.yml
---
- name: Wait for CEPH to stabilize
  ansible.builtin.pause:
    seconds: 30

- name: Check CEPH cluster health
  ansible.builtin.command:
    cmd: ceph health
  register: ceph_health
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Get CEPH status
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_status
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Parse CEPH status
  ansible.builtin.set_fact:
    ceph_status_data: "{{ ceph_status.stdout | from_json }}"

- name: Calculate expected OSD count
  ansible.builtin.set_fact:
    expected_osd_count: >-
      {{
        ceph_osds.values()
        | list
        | flatten
        | map(attribute='partitions', default=1)
        | sum
      }}
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify OSD count
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_osds | int == expected_osd_count | int
    fail_msg: "Expected {{ expected_osd_count }} OSDs but found {{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify all OSDs are up
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_up_osds == ceph_status_data.osdmap.num_osds
    fail_msg: "Not all OSDs are up: {{ ceph_status_data.osdmap.num_up_osds }}/{{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify all OSDs are in
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_in_osds == ceph_status_data.osdmap.num_osds
    fail_msg: "Not all OSDs are in cluster: {{ ceph_status_data.osdmap.num_in_osds }}/{{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Wait for PGs to become active+clean
  ansible.builtin.command:
    cmd: ceph pg stat --format json
  register: pg_stat
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  until: >
    (pg_stat.stdout | from_json).num_pg_by_state
    | selectattr('name', 'equalto', 'active+clean')
    | map(attribute='num')
    | sum == (pg_stat.stdout | from_json).num_pgs
  retries: 60
  delay: 10

- name: Display CEPH cluster summary
  ansible.builtin.debug:
    msg: |
      CEPH Cluster Health: {{ ceph_health.stdout }}
      Total OSDs: {{ ceph_status_data.osdmap.num_osds }}
      OSDs Up: {{ ceph_status_data.osdmap.num_up_osds }}
      OSDs In: {{ ceph_status_data.osdmap.num_in_osds }}
      PGs: {{ ceph_status_data.pgmap.num_pgs }}
      Data: {{ ceph_status_data.pgmap.bytes_used | default(0) | human_readable }}
      Available: {{ ceph_status_data.pgmap.bytes_avail | default(0) | human_readable }}
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
```

## Matrix Cluster Configuration Example

```yaml
# group_vars/matrix_cluster.yml (CEPH section)
---
# CEPH configuration
ceph_enabled: true
ceph_repository: "no-subscription"      # or "enterprise" with subscription
ceph_network: "192.168.5.0/24"          # vmbr1 - Public network
ceph_cluster_network: "192.168.7.0/24"  # vmbr2 - Private network

# OSD configuration (4 OSDs per node = 12 total)
ceph_osds:
  foxtrot:
    - device: /dev/nvme1n1
      partitions: 2  # Create 2 OSDs per 4TB NVMe
      device_size_gb: 4000
      partition_indices: [0, 1]
      db_device: null
      wal_device: null
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      db_device: null
      wal_device: null
      crush_device_class: nvme

  golf:
    - device: /dev/nvme1n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme

  hotel:
    - device: /dev/nvme1n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme

# Pool configuration
ceph_pools:
  - name: vm_ssd
    pg_num: 128
    pgp_num: 128
    size: 3      # Replicate across 3 nodes
    min_size: 2  # Minimum 2 replicas required
    application: rbd
    compression: false

  - name: vm_containers
    pg_num: 64
    pgp_num: 64
    size: 3
    min_size: 2
    application: rbd
    compression: true
    compression_algorithm: zstd

# Safety flags
ceph_wipe_disks: false  # Set to true for fresh deployment (DESTRUCTIVE!)
```
|
||||||
|
|
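
As a rough sanity check on the `pg_num` values above: a common rule of thumb is total PGs ≈ (OSD count × 100) / replica size, rounded to a power of two. With 12 OSDs and `size: 3` that gives (12 × 100) / 3 = 400, i.e. roughly 512 PGs cluster-wide, so the 128 + 64 configured here leaves headroom for additional pools (and recent Ceph releases enable the PG autoscaler by default, which can adjust these values later).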

## Complete Playbook Example

```yaml
# playbooks/ceph-deploy.yml
---
- name: Deploy CEPH Storage on Proxmox Cluster
  hosts: "{{ cluster_group | default('matrix_cluster') }}"
  become: true
  serial: 1  # Deploy one node at a time

  pre_tasks:
    - name: Verify cluster is healthy
      ansible.builtin.command:
        cmd: pvecm status
      register: cluster_check
      changed_when: false
      failed_when: "'Quorate: Yes' not in cluster_check.stdout"

    - name: Verify CEPH networks MTU
      ansible.builtin.command:
        cmd: "ip link show {{ item }}"
      register: mtu_check
      changed_when: false
      failed_when: "'mtu 9000' not in mtu_check.stdout"
      loop:
        - vmbr1  # CEPH public
        - vmbr2  # CEPH private

    - name: Display CEPH configuration
      ansible.builtin.debug:
        msg: |
          Deploying CEPH to cluster: {{ cluster_name }}
          Public network: {{ ceph_network }}
          Cluster network: {{ ceph_cluster_network }}
          Expected OSDs: {{ ceph_osds.values() | list | flatten | map(attribute='partitions', default=1) | sum }}
      run_once: true

  roles:
    - role: proxmox_ceph

  post_tasks:
    - name: Display CEPH OSD tree
      ansible.builtin.command:
        cmd: ceph osd tree
      register: osd_tree_final
      changed_when: false
      delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
      run_once: true

    - name: Show OSD tree
      ansible.builtin.debug:
        var: osd_tree_final.stdout_lines
      run_once: true

    - name: Display pool information
      ansible.builtin.command:
        cmd: ceph osd pool ls detail
      register: pool_info
      changed_when: false
      delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
      run_once: true

    - name: Show pool details
      ansible.builtin.debug:
        var: pool_info.stdout_lines
      run_once: true
```

## Usage

### Deploy CEPH to Matrix Cluster

```bash
# Check syntax
ansible-playbook playbooks/ceph-deploy.yml --syntax-check

# Deploy CEPH
ansible-playbook playbooks/ceph-deploy.yml --limit matrix_cluster

# Verify CEPH status
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph status"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph osd tree"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph df"
```

### Add mise Tasks

```toml
# .mise.toml
[tasks."ceph:deploy"]
description = "Deploy CEPH storage on cluster"
run = """
cd ansible
uv run ansible-playbook playbooks/ceph-deploy.yml
"""

[tasks."ceph:status"]
description = "Show CEPH cluster status"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "ceph -s"
"""

[tasks."ceph:health"]
description = "Show CEPH health detail"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "ceph health detail"
"""
```
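
Assuming mise is installed on the control machine, the tasks above then run as:

```bash
mise run ceph:deploy
mise run ceph:status
mise run ceph:health
```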

## Troubleshooting

### OSDs Won't Create

**Symptoms:**

- `pveceph osd create` fails with "already in use" error

**Solutions:**

1. Check if disk has existing partitions: `lsblk /dev/nvme1n1`
2. Wipe disk: `wipefs -a /dev/nvme1n1` (DESTRUCTIVE!)
3. Set `ceph_wipe_disks: true` in group_vars
4. Check for existing LVM: `pvdisplay`, `lvdisplay`
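
Steps 1, 2, and 4 combine into one destructive cleanup for a single device; a minimal sketch, assuming `/dev/nvme1n1` holds nothing you need:

```bash
# DESTRUCTIVE: removes LVM state and all on-disk signatures from the device
ceph-volume lvm zap /dev/nvme1n1 --destroy
wipefs -a /dev/nvme1n1
```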

### PGs Stuck in Creating

**Symptoms:**

- PGs stay in "creating" state for extended period

**Solutions:**

1. Check OSD status: `ceph osd tree`
2. Verify all OSDs are up and in: `ceph osd stat`
3. Check mon/mgr status: `ceph mon stat`, `ceph mgr stat`
4. Review logs: `journalctl -u ceph-osd@*.service -n 100`
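
To see exactly which PGs are stuck and why, Ceph can dump them directly:

```bash
# Also accepts: unclean, stale, undersized, degraded
ceph pg dump_stuck inactive
```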

### Poor CEPH Performance

**Symptoms:**

- Slow VM disk I/O

**Solutions:**

1. Verify MTU 9000: `ip link show vmbr1 | grep mtu`
2. Test network throughput: `iperf3` between nodes
3. Check OSD utilization: `ceph osd df`
4. Verify SSD/NVMe is being used: `ceph osd tree`
5. Check for rebalancing: `ceph -s` (look for "recovery")
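
Step 2 can be done node-to-node with iperf3; a minimal sketch, assuming iperf3 is installed and 192.168.5.6 is golf's address on the CEPH public network:

```bash
# On golf: start a listener
iperf3 -s

# On foxtrot: measure throughput toward golf with 4 parallel streams for 10s
iperf3 -c 192.168.5.6 -P 4 -t 10
```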

## Related Workflows

- [Cluster Formation](cluster-formation.md) - Form cluster before CEPH
- [Network Configuration](../reference/networking.md) - Configure CEPH networks
- [Storage Management](../reference/storage-management.md) - Manage CEPH pools and OSDs

## References

- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 1431-1562)
- Proxmox VE CEPH documentation
- CEPH deployment best practices
- [Ansible CEPH automation pattern](../../.claude/skills/ansible-best-practices/patterns/ceph-automation.md)
646
skills/proxmox-infrastructure/workflows/cluster-formation.md
Normal file
646
skills/proxmox-infrastructure/workflows/cluster-formation.md
Normal file
@@ -0,0 +1,646 @@
# Proxmox Cluster Formation Workflow

Complete guide to forming a Proxmox VE cluster using Ansible automation with idempotent patterns.

## Overview

This workflow automates the creation of a Proxmox VE cluster with:

- Hostname resolution configuration
- SSH key distribution for cluster operations
- Idempotent cluster initialization
- Corosync network configuration
- Quorum and health verification

## Prerequisites

Before forming a cluster:

1. **All nodes must have:**
   - Proxmox VE 8.x or 9.x installed
   - Network connectivity on the management network
   - Dedicated corosync network configured (VLAN 9 for Matrix)
   - Unique hostnames
   - Synchronized time (NTP configured)

2. **Minimum requirements:**
   - At least 3 nodes for quorum (production)
   - 1 node for development/testing (not recommended)

3. **Network requirements:**
   - All nodes must be able to resolve each other's hostnames
   - Corosync network must be isolated (no VM traffic)
   - Low latency between nodes (<2ms recommended)
   - MTU 1500 on the management network
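
A quick manual preflight catches most of these issues before any playbook runs; a sketch, assuming root SSH access and the Matrix node names used below:

```bash
# Check hostname, PVE version, and NTP sync state on every node
for node in foxtrot golf hotel; do
  ssh root@"$node" 'hostname; pveversion; timedatectl show -p NTPSynchronized --value'
done
```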

## Phase 1: Prepare Cluster Nodes

### Step 1: Verify Prerequisites

```yaml
# roles/proxmox_cluster/tasks/prerequisites.yml
---
- name: Check Proxmox VE is installed
  ansible.builtin.stat:
    path: /usr/bin/pvecm
  register: pvecm_binary
  failed_when: not pvecm_binary.stat.exists

- name: Get Proxmox VE version
  ansible.builtin.command:
    cmd: pveversion
  register: pve_version
  changed_when: false

- name: Verify minimum Proxmox VE version
  ansible.builtin.assert:
    that:
      - "'pve-manager/9' in pve_version.stdout or 'pve-manager/8' in pve_version.stdout"
    fail_msg: "Proxmox VE 8.x or 9.x required"

- name: Verify minimum node count for production
  ansible.builtin.assert:
    that:
      - groups[cluster_group] | length >= 3
    fail_msg: "Production cluster requires at least 3 nodes for quorum"
  when: cluster_environment == 'production'

- name: Check no existing cluster membership
  ansible.builtin.command:
    cmd: pvecm status
  register: existing_cluster
  failed_when: false
  changed_when: false

- name: Display cluster warning if already member
  ansible.builtin.debug:
    msg: |
      WARNING: Node {{ inventory_hostname }} is already a cluster member.
      Current cluster: {{ existing_cluster.stdout }}
      This playbook will attempt to join the target cluster.
  when:
    - existing_cluster.rc == 0
    - cluster_name not in existing_cluster.stdout
```

### Step 2: Configure Hostname Resolution

```yaml
# roles/proxmox_cluster/tasks/hosts_config.yml
---
- name: Ensure cluster nodes in /etc/hosts (management IP)
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: "^{{ item.management_ip }}\\s+"
    line: "{{ item.management_ip }} {{ item.fqdn }} {{ item.short_name }}"
    state: present
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"

- name: Ensure corosync IPs in /etc/hosts
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: "^{{ item.corosync_ip }}\\s+"
    line: "{{ item.corosync_ip }} {{ item.short_name }}-corosync"
    state: present
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"

- name: Verify hostname resolution (forward)
  ansible.builtin.command:
    cmd: "getent hosts {{ item.fqdn }}"
  register: host_lookup
  failed_when: host_lookup.rc != 0
  changed_when: false
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.fqdn }}"

- name: Verify hostname resolution (reverse)
  ansible.builtin.command:
    cmd: "getent hosts {{ item.management_ip }}"
  register: reverse_lookup
  failed_when:
    - reverse_lookup.rc != 0
  changed_when: false
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.management_ip }}"

- name: Test corosync network connectivity
  ansible.builtin.command:
    cmd: "ping -c 3 -W 2 {{ item.corosync_ip }}"
  register: corosync_ping
  changed_when: false
  when: item.short_name != inventory_hostname_short
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"
```

### Step 3: Distribute SSH Keys

```yaml
# roles/proxmox_cluster/tasks/ssh_keys.yml
---
- name: Generate SSH key for root (if not exists)
  ansible.builtin.user:
    name: root
    generate_ssh_key: true
    ssh_key_type: ed25519
    ssh_key_comment: "root@{{ inventory_hostname }}"
  register: root_ssh_key

- name: Fetch public keys from all nodes
  ansible.builtin.slurp:
    src: /root/.ssh/id_ed25519.pub
  register: node_public_keys

- name: Distribute SSH keys to all nodes
  ansible.posix.authorized_key:
    user: root
    state: present
    key: "{{ hostvars[item].node_public_keys.content | b64decode }}"
    comment: "cluster-{{ item }}"
  loop: "{{ groups[cluster_group] }}"
  when: item != inventory_hostname

- name: Populate known_hosts with node SSH keys
  # ssh-keyscan appends on every run, so this task is not idempotent;
  # duplicate entries are harmless but accumulate across reruns
  ansible.builtin.shell:
    cmd: "ssh-keyscan -H {{ item }} >> /root/.ssh/known_hosts"
  when: item != inventory_hostname
  loop: "{{ groups[cluster_group] }}"
  loop_control:
    label: "{{ item }}"
  changed_when: true

- name: Test SSH connectivity to all nodes
  ansible.builtin.command:
    cmd: "ssh -o ConnectTimeout=5 {{ item }} hostname"
  register: ssh_test
  changed_when: false
  when: item != inventory_hostname
  loop: "{{ groups[cluster_group] }}"
  loop_control:
    label: "{{ item }}"
```
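
If rerun hygiene matters, the keyscan task can be replaced with the `known_hosts` module; a sketch, assuming the Ansible controller itself can reach each node (the `pipe` lookup runs on the controller, not the target):

```yaml
- name: Populate known_hosts idempotently
  ansible.builtin.known_hosts:
    path: /root/.ssh/known_hosts
    name: "{{ item }}"
    key: "{{ lookup('ansible.builtin.pipe', 'ssh-keyscan -t ed25519 ' ~ item) }}"
  loop: "{{ groups[cluster_group] }}"
  when: item != inventory_hostname
```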

## Phase 2: Initialize Cluster

### Step 4: Create Cluster (First Node Only)

```yaml
# roles/proxmox_cluster/tasks/cluster_init.yml
---
- name: Check existing cluster status
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_status
  failed_when: false
  changed_when: false

- name: Get cluster nodes list
  ansible.builtin.command:
    cmd: pvecm nodes
  register: cluster_nodes_check
  failed_when: false
  changed_when: false

- name: Set cluster facts
  ansible.builtin.set_fact:
    in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"

- name: Create new cluster on first node
  ansible.builtin.command:
    cmd: "pvecm create {{ cluster_name }} --link0 {{ corosync_link0_address }}"
  when: not in_target_cluster
  register: cluster_create
  changed_when: cluster_create.rc == 0

- name: Wait for cluster to initialize
  ansible.builtin.pause:
    seconds: 10
  when: cluster_create.changed

- name: Verify cluster creation
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_verify
  changed_when: false
  failed_when: cluster_name not in cluster_verify.stdout

- name: Display cluster status
  ansible.builtin.debug:
    var: cluster_verify.stdout_lines
  when: cluster_create.changed or ansible_verbosity > 0
```

### Step 5: Join Nodes to Cluster

```yaml
# roles/proxmox_cluster/tasks/cluster_join.yml
---
- name: Check if already in cluster
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_status
  failed_when: false
  changed_when: false

- name: Set membership facts
  ansible.builtin.set_fact:
    is_cluster_member: "{{ cluster_status.rc == 0 }}"
    in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"

- name: Get first node hostname
  ansible.builtin.set_fact:
    first_node_hostname: "{{ groups[cluster_group][0] }}"  # group members are already inventory hostnames

- name: Join cluster
  ansible.builtin.command:
    cmd: >
      pvecm add {{ first_node_hostname }}
      --link0 {{ corosync_link0_address }}
  when:
    - not is_cluster_member or not in_target_cluster
  register: cluster_join
  changed_when: cluster_join.rc == 0
  failed_when:
    - cluster_join.rc != 0
    - "'already in a cluster' not in cluster_join.stderr"

- name: Wait for node to join cluster
  ansible.builtin.pause:
    seconds: 10
  when: cluster_join.changed

- name: Verify cluster membership
  ansible.builtin.command:
    cmd: pvecm status
  register: join_verify
  changed_when: false
  failed_when:
    - "'Quorate: Yes' not in join_verify.stdout"
```

## Phase 3: Configure Corosync

### Step 6: Corosync Network Configuration

```yaml
# roles/proxmox_cluster/tasks/corosync.yml
---
- name: Get current corosync configuration
  ansible.builtin.slurp:
    src: /etc/pve/corosync.conf
  register: corosync_conf_current

- name: Parse current corosync config
  ansible.builtin.set_fact:
    current_corosync: "{{ corosync_conf_current.content | b64decode }}"

- name: Check if corosync config needs update
  ansible.builtin.set_fact:
    corosync_needs_update: "{{ corosync_network not in current_corosync }}"

- name: Backup corosync.conf
  ansible.builtin.copy:
    src: /etc/pve/corosync.conf
    dest: "/etc/pve/corosync.conf.{{ ansible_date_time.epoch }}.bak"
    remote_src: true
    mode: '0640'
  when: corosync_needs_update
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true

- name: Update corosync configuration
  ansible.builtin.template:
    src: corosync.conf.j2
    dest: /etc/pve/corosync.conf.new
    validate: corosync-cfgtool -c %s
    mode: '0640'
  when: corosync_needs_update
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true

- name: Apply new corosync configuration
  ansible.builtin.copy:
    src: /etc/pve/corosync.conf.new
    dest: /etc/pve/corosync.conf
    remote_src: true
    mode: '0640'
  when: corosync_needs_update
  notify:
    - reload corosync
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true
```

**Corosync Template Example:**

```jinja2
# templates/corosync.conf.j2
totem {
  version: 2
  cluster_name: {{ cluster_name }}
  {# pmxcfs only propagates corosync.conf when config_version increases;
     corosync_config_version is assumed to be bumped in group_vars on every change #}
  config_version: {{ corosync_config_version | default(2) }}
  transport: knet
  crypto_cipher: aes256
  crypto_hash: sha256

  interface {
    linknumber: 0
    knet_link_priority: 255
  }
}

nodelist {
{% for node in cluster_nodes %}
  node {
    name: {{ node.short_name }}
    nodeid: {{ node.node_id }}
    quorum_votes: 1
    ring0_addr: {{ node.corosync_ip }}
  }
{% endfor %}
}

quorum {
  provider: corosync_votequorum
{% if cluster_nodes | length == 2 %}
  two_node: 1
{% endif %}
}

logging {
  to_logfile: yes
  logfile: /var/log/corosync/corosync.log
  to_syslog: yes
  timestamp: on
}
```

## Phase 4: Verify Cluster Health

### Step 7: Health Checks

```yaml
# roles/proxmox_cluster/tasks/verify.yml
---
- name: Wait for cluster to stabilize
  ansible.builtin.pause:
    seconds: 15

- name: Check cluster quorum
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_health
  changed_when: false
  failed_when: "'Quorate: Yes' not in cluster_health.stdout"

- name: Get cluster node count
  ansible.builtin.command:
    cmd: pvecm nodes
  register: cluster_nodes_final
  changed_when: false

- name: Verify expected node count
  ansible.builtin.assert:
    that:
      - cluster_nodes_final.stdout_lines | length >= groups[cluster_group] | length
    fail_msg: "Expected {{ groups[cluster_group] | length }} nodes but found {{ cluster_nodes_final.stdout_lines | length }}"

- name: Check corosync ring status
  ansible.builtin.command:
    cmd: corosync-cfgtool -s
  register: corosync_status
  changed_when: false

- name: Verify all nodes in corosync
  ansible.builtin.assert:
    that:
      - "'online' in corosync_status.stdout"
    fail_msg: "Corosync ring issues detected"

- name: Get cluster configuration version
  ansible.builtin.command:
    cmd: corosync-cmapctl -b totem.config_version
  register: config_version
  changed_when: false

- name: Display cluster health summary
  ansible.builtin.debug:
    msg: |
      Cluster: {{ cluster_name }}
      Quorum: {{ 'Yes' if 'Quorate: Yes' in cluster_health.stdout else 'No' }}
      Nodes: {{ cluster_nodes_final.stdout_lines | length }}
      Config Version: {{ config_version.stdout }}
```

## Matrix Cluster Example Configuration

```yaml
# group_vars/matrix_cluster.yml
---
cluster_name: "Matrix"
cluster_group: "matrix_cluster"
cluster_environment: "production"

# Corosync configuration
corosync_network: "192.168.8.0/24"  # VLAN 9

# Node configuration
cluster_nodes:
  - short_name: foxtrot
    fqdn: foxtrot.matrix.spaceships.work
    management_ip: 192.168.3.5
    corosync_ip: 192.168.8.5
    node_id: 1

  - short_name: golf
    fqdn: golf.matrix.spaceships.work
    management_ip: 192.168.3.6
    corosync_ip: 192.168.8.6
    node_id: 2

  - short_name: hotel
    fqdn: hotel.matrix.spaceships.work
    management_ip: 192.168.3.7
    corosync_ip: 192.168.8.7
    node_id: 3

# Set per-node corosync address
corosync_link0_address: "{{ cluster_nodes | selectattr('short_name', 'equalto', inventory_hostname_short) | map(attribute='corosync_ip') | first }}"
```
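
To confirm each node resolves its own link0 address from this structure, a quick ad-hoc check (assuming the inventory path used elsewhere in this document):

```bash
ansible -i inventory/proxmox.yml matrix_cluster -m ansible.builtin.debug -a "var=corosync_link0_address"
```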

## Complete Playbook Example

```yaml
# playbooks/cluster-init.yml
---
- name: Initialize Proxmox Cluster
  hosts: "{{ cluster_group | default('matrix_cluster') }}"
  become: true
  serial: 1  # One node at a time for safety

  pre_tasks:
    - name: Validate cluster group is defined
      ansible.builtin.assert:
        that:
          - cluster_group is defined
          - cluster_name is defined
          - cluster_nodes is defined
        fail_msg: "Required variables not defined in group_vars"

    - name: Display cluster configuration
      ansible.builtin.debug:
        msg: |
          Forming cluster: {{ cluster_name }}
          Nodes: {{ cluster_nodes | map(attribute='short_name') | join(', ') }}
          Corosync network: {{ corosync_network }}
      run_once: true

  tasks:
    # include_role resolves the role's task files; role_path is only
    # defined inside a role, so it cannot be used from a playbook
    - name: Verify prerequisites
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: prerequisites

    - name: Configure /etc/hosts
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: hosts_config

    - name: Distribute SSH keys
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: ssh_keys

    # First node creates cluster
    - name: Initialize cluster on first node
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: cluster_init
      when: inventory_hostname == groups[cluster_group][0]

    # Wait for first node
    - name: Wait for first node to complete
      ansible.builtin.pause:
        seconds: 20
      when: inventory_hostname != groups[cluster_group][0]

    # Other nodes join
    - name: Join cluster on other nodes
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: cluster_join
      when: inventory_hostname != groups[cluster_group][0]

    - name: Configure corosync
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: corosync

    - name: Verify cluster health
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: verify

  post_tasks:
    - name: Display final cluster status
      ansible.builtin.command:
        cmd: pvecm status
      register: final_status
      changed_when: false
      delegate_to: "{{ groups[cluster_group][0] }}"
      run_once: true

    - name: Show cluster status
      ansible.builtin.debug:
        var: final_status.stdout_lines
      run_once: true

  handlers:
    - name: reload corosync
      ansible.builtin.systemd:
        name: corosync
        state: reloaded
      throttle: 1
```

## Usage

### Initialize Matrix Cluster

```bash
# Check syntax
ansible-playbook playbooks/cluster-init.yml --syntax-check

# Dry run (of limited value here: command tasks are skipped in check mode)
ansible-playbook playbooks/cluster-init.yml --check --diff

# Initialize cluster
ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster

# Verify cluster status
ansible -i inventory/proxmox.yml foxtrot -m shell -a "pvecm status"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "pvecm nodes"
```

### Add mise Task

```toml
# .mise.toml
[tasks."cluster:init"]
description = "Initialize Proxmox cluster"
run = """
cd ansible
uv run ansible-playbook playbooks/cluster-init.yml
"""

[tasks."cluster:status"]
description = "Show cluster status"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "pvecm status"
"""
```

## Troubleshooting

### Node Won't Join Cluster

**Symptoms:**

- `pvecm add` fails with timeout or connection error

**Solutions:**

1. Verify SSH connectivity: `ssh root@first-node hostname`
2. Check /etc/hosts: `getent hosts first-node`
3. Verify corosync network: `ping -c 3 192.168.8.5`
4. Check firewall for the corosync ports (UDP 5405-5412): `iptables -L -n | grep 540`

### Cluster Shows No Quorum

**Symptoms:**

- `pvecm status` shows `Quorate: No`

**Solutions:**

1. Check node count: Must have majority (2 of 3, 3 of 5, etc.)
2. Verify corosync: `systemctl status corosync`
3. Check corosync ring: `corosync-cfgtool -s`
4. Review logs: `journalctl -u corosync -n 50`
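
If a node is up but the cluster cannot regain majority (for example, during maintenance with only one survivor), Proxmox can temporarily lower the expected vote count; use with care, as it deliberately overrides quorum protection:

```bash
# On the surviving node only; restore normal expectations once nodes return
pvecm expected 1
```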

### Configuration Sync Issues

**Symptoms:**

- Changes on one node don't appear on others

**Solutions:**

1. Verify pmxcfs: `systemctl status pve-cluster`
2. Check filesystem: `pvecm status | grep -i cluster`
3. Restart cluster filesystem: `systemctl restart pve-cluster`

## Related Workflows

- [CEPH Deployment](ceph-deployment.md) - Deploy CEPH after cluster formation
- [Network Configuration](../reference/networking.md) - Configure cluster networking
- [Cluster Maintenance](cluster-maintenance.md) - Add/remove nodes, upgrades

## References

- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 1318-1428)
- Proxmox VE Cluster Manager documentation
- Corosync configuration guide
- [Ansible cluster automation pattern](../../ansible-best-practices/patterns/cluster-automation.md)