Initial commit

12 .claude-plugin/plugin.json Normal file
@@ -0,0 +1,12 @@
{
  "name": "proxmox-infrastructure",
  "description": "Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage",
  "version": "1.0.0",
  "author": {
    "name": "basher83",
    "email": "basher83@mail.spaceships.work"
  },
  "skills": [
    "./skills"
  ]
}

3 README.md Normal file
@@ -0,0 +1,3 @@
# proxmox-infrastructure

Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage

105 plugin.lock.json Normal file
@@ -0,0 +1,105 @@
{
  "$schema": "internal://schemas/plugin.lock.v1.json",
  "pluginId": "gh:basher83/lunar-claude:plugins/infrastructure/proxmox-infrastructure",
  "normalized": {
    "repo": null,
    "ref": "refs/tags/v20251128.0",
    "commit": "4443a5d5df66f90ee5678d11181044572ae39bcb",
    "treeHash": "5c6ff4105707bab91f3474e49aaed2d449e4ec488f25d2f2d552d6eadd167b54",
    "generatedAt": "2025-11-28T10:14:12.158310Z",
    "toolVersion": "publish_plugins.py@0.2.0"
  },
  "origin": {
    "remote": "git@github.com:zhongweili/42plugin-data.git",
    "branch": "master",
    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
  },
  "manifest": {
    "name": "proxmox-infrastructure",
    "description": "Proxmox VE cluster management including VM provisioning, templates, VLAN networking, and CEPH storage",
    "version": "1.0.0"
  },
  "content": {
    "files": [
      {
        "path": "README.md",
        "sha256": "dc1558215c32922f14e23d784c1b5f7f5296fdd5090e4c1298c7248236443dd7"
      },
      {
        "path": ".claude-plugin/plugin.json",
        "sha256": "37f0fe197fab412f2f5e99afaaa1e87345c0347fa1c6ca53a908bdd7ce3f8e15"
      },
      {
        "path": "skills/proxmox-infrastructure/SKILL.md",
        "sha256": "6277a71e9d31ec7bfc2babbdff53d1492c882f556a83a0f3ffa6a2b7c2418275"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/validate_template.py",
        "sha256": "c23a456e1e24de595e3e70078ae693543e19e7e7d374c63e123a0835aa9c8f18"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/check_cluster_health.py",
        "sha256": "7681ed0b793191437976ca578463d93b69acbf98a640813a4dcf76c563756fef"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/cluster_status.py",
        "sha256": "f492fbb074443ff9f839a390826a3811527594bb851e47d3c8fd7a69aa56af8e"
      },
      {
        "path": "skills/proxmox-infrastructure/tools/check_ceph_health.py",
        "sha256": "25ff530395eb757e1fd16da6cf1162a62d08aefd76fde2af1fa5eb3f08882c0e"
      },
      {
        "path": "skills/proxmox-infrastructure/anti-patterns/common-mistakes.md",
        "sha256": "f294cc1b8f21d397653f6cfe6b5f1eb0f5cb537a6c672cfafe05397ed2ca00d0"
      },
      {
        "path": "skills/proxmox-infrastructure/workflows/cluster-formation.md",
        "sha256": "af28601e2f561bfaf1342506f6f3eda7d2e09e51fa7d37f894beffcab6566d49"
      },
      {
        "path": "skills/proxmox-infrastructure/workflows/ceph-deployment.md",
        "sha256": "ce7b7bd85f0eed0a01ea7dd591b0cd9f1a0ee7b2f4da3fba74f9b532593a82c7"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/main.tf",
        "sha256": "5f36b92de9ac76115291def1992631594339144c1ec6e2c6131908da536b2dc6"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/README.md",
        "sha256": "311a65c49482ae2e87b1e001559a6eeae79f68b7c840cdd640131dec7e8d9c4f"
      },
      {
        "path": "skills/proxmox-infrastructure/examples/01-basic-vm/variables.tf",
        "sha256": "1eca73d9ae2c16a1f49d55fad2309aa1761f2935074f7624e952b28b3b4d0ce5"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/qemu-guest-agent.md",
        "sha256": "c7545c83bcf443b27c81406b96abf4cfbf63be8114bab30f27536fa3ac1eb679"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/networking.md",
        "sha256": "eff95710e488f40c52203be17a4c11068daac9391fe3b005b874551475fbe5bf"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/storage-management.md",
        "sha256": "357eb01944e0f53ca61470b769c12dc89775e8979440b37f44340f3140f59154"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/cloud-init-patterns.md",
        "sha256": "f8bc068ef9eefe27305ed490415a46fe6eab4f68553b24ff79187b91abaa6fe9"
      },
      {
        "path": "skills/proxmox-infrastructure/reference/api-reference.md",
        "sha256": "6dd249e90f808628687ddf4eac4893eb448eb1f025d6a9a060e9ba7f2d5940fc"
      }
    ],
    "dirSha256": "5c6ff4105707bab91f3474e49aaed2d449e4ec488f25d2f2d552d6eadd167b54"
  },
  "security": {
    "scannedAt": null,
    "scannerVersion": null,
    "flags": []
  }
}

293 skills/proxmox-infrastructure/SKILL.md Normal file
@@ -0,0 +1,293 @@
---
name: proxmox-infrastructure
description: Proxmox VE cluster management including VM provisioning, template creation with cloud-init, QEMU guest agent integration, storage pool management, VLAN-aware bridge configuration, and Proxmox API interactions. Use when working with Proxmox VE, creating VM templates, configuring Proxmox networking, managing CEPH storage, troubleshooting VM deployment issues, or interacting with the Proxmox API.
---

# Proxmox Infrastructure Management

Expert guidance for managing Proxmox VE clusters, creating templates, provisioning VMs, and configuring network infrastructure.

## Quick Start

### Common Tasks

**Create VM Template:**

```bash
# See tools/build-template.yml for the automated playbook
cd ansible && uv run ansible-playbook playbooks/proxmox-build-template.yml
```

**Clone Template to VM:**

```bash
qm clone <template-id> <new-vmid> --name <vm-name>
qm set <new-vmid> --sshkey ~/.ssh/id_rsa.pub
qm set <new-vmid> --ipconfig0 ip=192.168.1.100/24,gw=192.168.1.1
qm start <new-vmid>
```

**Check Cluster Status:**

```bash
# Use tools/cluster_status.py
./tools/cluster_status.py
```

## When to Use This Skill

Activate this skill when:

- Creating or managing Proxmox VM templates
- Provisioning VMs via cloning or Terraform
- Configuring Proxmox networking (bridges, VLANs, bonds)
- Troubleshooting VM deployment or network issues
- Managing CEPH storage pools
- Working with the QEMU guest agent
- Interacting with the Proxmox API via Python or Ansible

## Core Workflows

### 1. Template Creation

#### Method 1: Using Ansible (Recommended)

See [tools/build-template.yml](tools/build-template.yml) for complete automation.

#### Method 2: Manual CLI

See [reference/cloud-init-patterns.md](reference/cloud-init-patterns.md) for detailed steps.

Key points:

- Use the `virtio-scsi-pci` controller for Ubuntu images
- Add a cloud-init CD-ROM drive (`ide2`)
- Configure a serial console for cloud images
- Convert to a template with `qm template <vmid>`

### 2. VM Provisioning

**From Ansible:**
See the existing playbook: [../../ansible/playbooks/proxmox-build-template.yml](../../ansible/playbooks/proxmox-build-template.yml)

**From Terraform:**
See examples in [../../terraform/netbox-vm/](../../terraform/netbox-vm/)

**Key Configuration:**

```yaml
# Ansible example
proxmox_kvm:
  node: foxtrot
  api_host: 192.168.3.5
  vmid: 101
  name: docker-01
  clone: ubuntu-template
  storage: local-lvm
  # Network with VLAN
  net:
    net0: 'virtio,bridge=vmbr0,tag=30'
  ipconfig:
    ipconfig0: 'ip=192.168.3.100/24,gw=192.168.3.1'
```

### 3. Network Configuration

This cluster ("Matrix") uses:

- **vmbr0**: Management (192.168.3.0/24, VLAN 9 for Corosync)
- **vmbr1**: CEPH Public (192.168.5.0/24, MTU 9000)
- **vmbr2**: CEPH Private (192.168.7.0/24, MTU 9000)

See [reference/networking.md](reference/networking.md) for:

- VLAN-aware bridge configuration (see the sketch below)
- Bond setup (802.3ad LACP)
- Routed vs bridged vs NAT setups
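
A minimal VLAN-aware stanza for the management bridge in `/etc/network/interfaces` might look like the following sketch (the port name and address are Foxtrot's, taken from the architecture section below; treat the exact values as illustrative):

```text
auto vmbr0
iface vmbr0 inet static
    address 192.168.3.5/24
    gateway 192.168.3.1
    bridge-ports enp4s0
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```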

## Architecture Reference

### This Cluster ("Matrix")

**Nodes:** Foxtrot, Golf, Hotel (3× MINISFORUM MS-A2)

**Hardware per Node:**

- AMD Ryzen 9 9955HX (16C/32T)
- 64GB DDR5 @ 5600 MT/s
- 3× NVMe: 1× 1TB (boot), 2× 4TB (CEPH)
- 4× NICs: 2× 10GbE SFP+, 2× 2.5GbE

**Network Architecture:**

```text
enp4s0      → vmbr0 (mgmt + vlan9 for corosync)
enp5s0f0np0 → vmbr1 (ceph public, MTU 9000)
enp5s0f1np1 → vmbr2 (ceph private, MTU 9000)
```

See [../../docs/goals.md](../../docs/goals.md) for complete specs.

## Tools Available

### Python Scripts (uv)

**validate_template.py** - Validate template health via the API

```bash
./tools/validate_template.py --template-id 9000
```

**vm_diagnostics.py** - VM health checks

```bash
./tools/vm_diagnostics.py --vmid 101
```

**cluster_status.py** - Cluster health metrics

```bash
./tools/cluster_status.py
```

### Ansible Playbooks

**build-template.yml** - Automated template creation

- Downloads the cloud image
- Creates a VM with the proper configuration
- Converts it to a template

**configure-networking.yml** - VLAN bridge setup

- Creates VLAN-aware bridges
- Configures bonds
- Sets MTU for storage networks

### OpenTofu Modules

**vm-module-example/** - Reusable VM provisioning

- Clone-based deployment
- Cloud-init integration
- Network configuration

See the [examples/](examples/) directory.

**Real Examples from Repository**:

- **Multi-VM Cluster**: [../../terraform/examples/microk8s-cluster](../../terraform/examples/microk8s-cluster) - Comprehensive 3-node MicroK8s deployment using the `for_each` pattern, cross-node cloning, **dual NIC with VLAN** (VLAN 30 primary, VLAN 2 secondary), and Ansible integration
- **Template with Cloud-Init**: [../../terraform/examples/template-with-custom-cloudinit](../../terraform/examples/template-with-custom-cloudinit) - Custom cloud-init snippet configuration
- **VLAN Bridge Configuration**: [../../ansible/playbooks/proxmox-enable-vlan-bridging.yml](../../ansible/playbooks/proxmox-enable-vlan-bridging.yml) - Enable VLAN-aware bridging on Proxmox nodes (supports VLANs 2-4094)

## Troubleshooting

Common issues and solutions:

### Template Creation Issues

**Serial console required:**
Many cloud images need a serial console configured.

```bash
qm set <vmid> --serial0 socket --vga serial0
```

**Boot order:**

```bash
qm set <vmid> --boot order=scsi0
```

### Network Issues

**VLAN not working:**

1. Check that the bridge is VLAN-aware:

   ```bash
   grep "bridge-vlan-aware" /etc/network/interfaces
   ```

2. Verify the VLAN is in `bridge-vids`:

   ```bash
   bridge vlan show
   ```

**MTU problems (CEPH):**
Ensure MTU 9000 on storage networks:

```bash
ip link show vmbr1 | grep mtu
```
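
If the MTU looks right but CEPH still misbehaves, verify that jumbo frames actually pass end-to-end (8972 bytes of payload = 9000 minus 28 bytes of IP and ICMP headers; the peer address is illustrative, any other node's CEPH-network IP works):

```bash
# Fails with "message too long" if any hop drops jumbo frames
ping -M do -s 8972 -c 3 192.168.5.6
```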

### VM Won't Start

1. Check the QEMU guest agent:

   ```bash
   qm agent <vmid> ping
   ```

2. Review cloud-init logs (in the VM):

   ```bash
   cloud-init status --wait
   cat /var/log/cloud-init.log
   ```

3. Validate that the template exists:

   ```bash
   qm list | grep template
   ```

For more issues, see the [troubleshooting/](troubleshooting/) directory.

## Best Practices

1. **Always use templates** - Clone for consistency
2. **SSH keys only** - Never use password auth
3. **VLAN-aware bridges** - Enable for flexibility
4. **MTU 9000 for storage** - Essential for CEPH performance
5. **Serial console** - Required for most cloud images
6. **Guest agent** - Enable for IP detection and graceful shutdown
7. **Tag VMs** - Use meaningful tags for organization (example below)
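
For example, tags can be applied from the CLI (tag names are illustrative):

```bash
qm set 101 --tags docker,vlan30
```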

## Progressive Disclosure

For deeper knowledge:

### Advanced Automation Workflows (from ProxSpray Analysis)

- [Cluster Formation](workflows/cluster-formation.md) - Complete cluster automation with idempotency
- [CEPH Deployment](workflows/ceph-deployment.md) - Automated CEPH storage deployment

### Core Reference

- [Cloud-Init patterns](reference/cloud-init-patterns.md) - Complete template creation guide
- [Network configuration](reference/networking.md) - VLANs, bonds, routing, NAT
- [API reference](reference/api-reference.md) - Proxmox API interactions
- [Storage management](reference/storage-management.md) - CEPH, LVM, datastores
- [QEMU guest agent](reference/qemu-guest-agent.md) - Integration and troubleshooting

### Anti-Patterns & Common Mistakes

- [Common Mistakes](anti-patterns/common-mistakes.md) - Real-world pitfalls from OpenTofu/Ansible deployments, template creation, and remote backend configuration

## Related Skills

- **NetBox + PowerDNS Integration** - Automatic DNS for Proxmox VMs
- **Ansible Best Practices** - Playbook patterns used in this cluster

313 skills/proxmox-infrastructure/anti-patterns/common-mistakes.md Normal file
@@ -0,0 +1,313 @@
# Common Mistakes and Anti-Patterns

Lessons learned from real-world Proxmox deployments. Avoid these pitfalls to save time and frustration.

## VM Provisioning with OpenTofu

**Note**: Use the `tofu` CLI (not `terraform`). All examples use OpenTofu.

### ❌ Cloud-Init File Not on Target Node

**Problem**: `tofu plan` succeeds but the VM fails to start or configure properly.

```hcl
# BAD - Cloud-init file only exists locally
resource "proxmox_virtual_environment_vm" "example" {
  initialization {
    user_data_file_id = "local:snippets/user-data.yaml"  # File doesn't exist on the node!
  }
}
```

**Solution**: The cloud-init YAML file MUST exist on the target Proxmox node's datastore.

```bash
# Upload to the Proxmox node first
scp user-data.yaml root@foxtrot:/var/lib/vz/snippets/

# Or use Ansible to deploy it
ansible proxmox_nodes -m copy -a "src=user-data.yaml dest=/var/lib/vz/snippets/"
```

**Reference**: See `terraform/netbox-template/user-data.yaml.example` for the required format.

---

### ❌ Template Missing on Target Node

**Problem**: `tofu apply` fails with a "template not found" error.

```hcl
# BAD - Template referenced but doesn't exist
resource "proxmox_virtual_environment_vm" "example" {
  node_name = "foxtrot"
  clone {
    vm_id = 9000  # Template doesn't exist on foxtrot!
  }
}
```

**Solution**: Ensure the template exists on the specific node you're deploying to.

```bash
# Check that the template exists
ssh root@foxtrot "qm list | grep 9000"

# Copy the template to another node if needed: clone to a NEW VMID,
# migrate the clone, then re-mark it as a template
ssh root@foxtrot "qm clone 9000 9001 --full --pool templates"
ssh root@foxtrot "qm migrate 9001 golf --with-local-disks"
ssh root@golf "qm template 9001"
```

**Better**: Use the Ansible playbook to create templates consistently across nodes:

```bash
cd ansible && uv run ansible-playbook playbooks/proxmox-build-template.yml
```

---

### ❌ Remote Backend Configuration Errors

**Problem**: OpenTofu fails to authenticate with Proxmox when using the Scalr remote backend.

```hcl
# BAD - Incorrect provider config for a remote backend
provider "proxmox" {
  endpoint = var.proxmox_api_url
  ssh {
    agent = true  # ❌ Doesn't work with a remote backend!
  }
}
```

**Solution (Remote Backend - Scalr)**:

```hcl
provider "proxmox" {
  endpoint = var.proxmox_api_url
  username = var.proxmox_username  # Must use variables
  password = var.proxmox_password  # Must use variables

  ssh {
    agent    = false  # Critical: false for remote backends
    username = var.ssh_username
  }
}
```

Required environment variables:

```bash
export SCALR_HOSTNAME="your-scalr-host"
export SCALR_TOKEN="your-scalr-token"
export TF_VAR_proxmox_username="root@pam"
export TF_VAR_proxmox_password="your-password"
```

**Solution (Local Testing)**:

```hcl
provider "proxmox" {
  endpoint = var.proxmox_api_url

  ssh {
    agent    = true  # Use the SSH agent for local testing
    username = "root"
  }
}
```

**Reference Architecture**:

- Local examples: `terraform/examples/`
- Versioned root modules: `basher83/Triangulum-Prime/terraform-bgp-vm`

---

## Template Creation

### ❌ Cloud Image Not Downloaded to Target Node

**Problem**: The Ansible playbook fails when creating a template from a cloud image.

```yaml
# BAD - Assuming the image exists
- name: Create VM from cloud image
  ansible.builtin.command: >
    qm importdisk {{ template_id }} ubuntu-22.04.img local-lvm
  # Fails: ubuntu-22.04.img doesn't exist!
```

**Solution**: Download the cloud image to the target node first.

```yaml
# GOOD - Download first
- name: Download Ubuntu cloud image
  ansible.builtin.get_url:
    url: https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img
    dest: /tmp/ubuntu-22.04.img
    checksum: sha256:...

- name: Import disk to VM
  ansible.builtin.command: >
    qm importdisk {{ template_id }} /tmp/ubuntu-22.04.img local-lvm
```

**Reference**: See `ansible/playbooks/proxmox-build-template.yml` for the complete workflow.

---

### ❌ Cloud-Init Snippet Format Violations

**Problem**: The VM boots but cloud-init doesn't configure it properly.

```yaml
# BAD - Wrong format
#cloud-config
users:
  - name: admin
    sudo: ALL=(ALL) NOPASSWD:ALL
    # Missing critical fields!
```

**Solution**: Use the standardized snippet format pre-configured for Ansible.

```yaml
# GOOD - Complete format
#cloud-config
users:
  - name: ansible
    groups: sudo
    shell: /bin/bash
    sudo: ALL=(ALL) NOPASSWD:ALL
    ssh_authorized_keys:
      - ssh-ed25519 AAAA...

package_update: true
package_upgrade: false

packages:
  - qemu-guest-agent
  - python3
  - python3-pip

runcmd:
  - systemctl enable qemu-guest-agent
  - systemctl start qemu-guest-agent
```

**Critical Requirements**:

- ✅ MUST include the `qemu-guest-agent` package
- ✅ MUST include `python3` for Ansible compatibility
- ✅ MUST configure an SSH key for the Ansible user
- ✅ MUST enable the qemu-guest-agent service

**Reference Format**: `terraform/netbox-template/user-data.yaml.example`

---

### ❌ Mixing Terraform and Ansible Provisioning

**Problem**: Confusion about which tool is responsible for what.

**Anti-Pattern**:

```hcl
# BAD - Complex provisioning in Terraform
resource "proxmox_virtual_environment_vm" "example" {
  initialization {
    user_data_file_id = "local:snippets/complex-setup.yaml"
    # Hundreds of lines of cloud-init doing app setup
  }
}
```

**Best Practice**: Clear separation of concerns.

**OpenTofu Responsibility**:

- VM resource allocation (CPU, memory, disk)
- Network configuration
- Basic cloud-init (user, SSH keys, qemu-guest-agent)
- Infrastructure provisioning

**Ansible Responsibility**:

- Application installation
- Configuration management
- Service orchestration
- Ongoing management

**Pattern**:

1. OpenTofu: Provision the VM with minimal cloud-init
2. Cloud-init: Create the ansible user; install qemu-guest-agent and python3
3. Ansible: Configure everything else

**Reference Architecture**:

- Template creation: `basher83/Triangulum-Prime/deployments/homelab/templates`
- OpenTofu examples: `terraform/examples/`

---

## Best Practices Summary

### Template Creation

1. ✅ Download cloud images to the target node before import
2. ✅ Use the standardized cloud-init snippet format
3. ✅ Always include qemu-guest-agent
4. ✅ Keep cloud-init minimal - let Ansible handle configuration
5. ✅ Reference: `basher83/Triangulum-Prime/deployments/homelab/templates`

### OpenTofu Provisioning

1. ✅ Verify the template exists on the target node
2. ✅ Upload cloud-init snippets before referencing them
3. ✅ Use `ssh.agent = false` for remote backends (Scalr)
4. ✅ Use `ssh.agent = true` for local testing
5. ✅ Set credentials via OpenTofu variables, never hardcoded
6. ✅ Reference: `terraform/examples/` and `basher83/Triangulum-Prime`

### Workflow

1. ✅ Create the template once per node (or sync it across nodes)
2. ✅ Upload cloud-init snippets to `/var/lib/vz/snippets/`
3. ✅ Provision the VM via OpenTofu (infrastructure)
4. ✅ Configure the VM via Ansible (applications/services) - see the sketch below
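
Put together, the whole flow is a short sequence; the paths, inventory, and playbook names below are illustrative, not fixed conventions of this repository:

```bash
# 2. Put the cloud-init snippet in place on the target node
scp user-data.yaml root@foxtrot:/var/lib/vz/snippets/

# 3. Provision the VM (infrastructure only)
tofu -chdir=terraform/examples/01-basic-vm apply

# 4. Configure applications and services
ansible-playbook -i inventory/hosts.yml site.yml
```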

---

## Quick Troubleshooting

### VM Won't Start After tofu apply

**Check**:

1. Does the template exist? `qm list | grep <template-id>`
2. Does the cloud-init file exist? `ls -la /var/lib/vz/snippets/`
3. Is qemu-guest-agent installed? `qm agent <vmid> ping`

### tofu Can't Connect to Proxmox

**Remote Backend**:

1. `ssh.agent = false`? ✅
2. `SCALR_HOSTNAME` and `SCALR_TOKEN` set? ✅
3. Using OpenTofu variables for credentials? ✅

**Local Testing**:

1. `ssh.agent = true`? ✅
2. SSH key in the agent? `ssh-add -l` ✅
3. Can you SSH to the node? `ssh root@foxtrot` ✅

### Cloud-Init Didn't Configure the VM

**Check**:

1. Does the file format match `user-data.yaml.example`? ✅
2. Does it include qemu-guest-agent? ✅
3. Does it include python3? ✅
4. VM console logs: `qm terminal <vmid>`, then check `/var/log/cloud-init.log`

245 skills/proxmox-infrastructure/examples/01-basic-vm/README.md Normal file
@@ -0,0 +1,245 @@
# Basic VM Deployment Example

**Learning objective:** Deploy your first VM using the unified VM module with minimal configuration.

## What This Example Shows

- ✅ Minimal required configuration for VM deployment
- ✅ Cloning from an existing template
- ✅ Static IP address configuration with cloud-init
- ✅ SSH key injection
- ✅ Module defaults (what you DON'T need to specify)

## Prerequisites

1. **Proxmox template** exists (VMID 9000)
   - Create one using `terraform/netbox-template/` or the Ansible playbook
   - Or use the Triangulum-Prime template examples

2. **Proxmox API credentials** configured:

   ```bash
   export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
   export PROXMOX_VE_API_TOKEN="user@realm!token-id=secret"
   # OR
   export PROXMOX_VE_USERNAME="root@pam"
   export PROXMOX_VE_PASSWORD="your-password"
   ```

3. **SSH public key** available:

   ```bash
   export TF_VAR_ssh_public_key="$(cat ~/.ssh/id_rsa.pub)"
   ```

## Quick Start

### 1. Initialize Terraform

```bash
tofu init
```

### 2. Review the Plan

```bash
tofu plan
```

**Expected resources:**

- 1 VM (cloned from template 9000)
- Cloud-init configuration
- Network interface with a static IP

### 3. Deploy

```bash
tofu apply
```

### 4. Verify

```bash
# SSH into the VM
ssh ansible@192.168.3.100

# Check the VM in Proxmox
qm status 100  # Or whatever VMID was assigned
```

### 5. Cleanup

```bash
tofu destroy
```

## Understanding the Configuration

### What You MUST Specify

```hcl
# These eight parameters are required:
vm_type       = "clone"       # Clone from template
pve_node      = "foxtrot"     # Which node
vm_name       = "test-vm-01"  # VM name
src_clone     = { ... }       # Template to clone
vm_disk       = { ... }       # Disk config
vm_net_ifaces = { ... }       # Network config
vm_init       = { ... }       # Cloud-init config
vm_efi_disk   = { ... }       # EFI boot disk
```

### What Uses Defaults

The module provides sensible defaults for:

| Setting | Default | Why It's Good |
|---------|---------|---------------|
| CPU cores | 2 | Minimal baseline |
| Memory | 2048 MB (2 GB) | Enough for most services |
| CPU type | `host` | Best performance |
| Guest agent | Enabled | Needed for IP detection |
| BIOS | `ovmf` (UEFI) | Modern, secure |
| Machine | `q35` | Modern chipset |
| Display | Standard VGA | Works everywhere |
| Serial console | Enabled | Troubleshooting |
| RNG device | Enabled | Entropy for crypto |

**See:** [Module DEFAULTS.md](https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md)

## Customization

### Change VM Resources

Override defaults in `main.tf`:

```hcl
module "basic_vm" {
  # ... required params ...

  # Override CPU
  vm_cpu = {
    cores = 4  # Increase to 4 cores
  }

  # Override memory
  vm_mem = {
    dedicated = 8192  # 8 GB
  }
}
```

### Use a Different Template

Change the template ID:

```hcl
src_clone = {
  datastore_id = "local-lvm"
  tpl_id       = 9001  # Different template
}
```

### Add VLAN Tagging

```hcl
vm_net_ifaces = {
  net0 = {
    bridge    = "vmbr0"
    vlan_id   = 30  # Add VLAN tag
    ipv4_addr = "192.168.3.100/24"
    ipv4_gw   = "192.168.3.1"
  }
}
```

## Common Issues

### Issue: "Template 9000 not found"

**Solution:** Create a template first:

```bash
cd ../../..  # Back to the repo root
cd terraform/netbox-template
tofu apply
```

### Issue: "IP address already in use"

**Solution:** Change the `ip_address` variable:

```bash
tofu apply -var="ip_address=192.168.3.101"
```

### Issue: "Cannot connect to Proxmox API"

**Solution:** Check credentials:

```bash
echo $PROXMOX_VE_ENDPOINT
echo $PROXMOX_VE_API_TOKEN
```

### Issue: "EFI disk creation failed"

**Solution:** Ensure the datastore has space:

```bash
# On the Proxmox node
pvesm status
```

## Next Steps

### Learn More

1. **Production Configuration:** See `../02-production-vm/`
   - Shows common overrides for production
   - Resource-sizing best practices
   - Tagging and organization

2. **Template Creation:** See `../03-template-creation/`
   - How to create templates from cloud images
   - Template best practices

3. **Complete Examples:** Triangulum-Prime repository
   - [Single VM](https://github.com/basher83/Triangulum-Prime/tree/main/examples/single-vm)
   - [MicroK8s Cluster](https://github.com/basher83/Triangulum-Prime/tree/main/examples/microk8s-cluster)
   - [Custom Cloud-init](https://github.com/basher83/Triangulum-Prime/tree/main/examples/template-with-custom-cloudinit)

### Integration Examples

- **NetBox + DNS:** See `.claude/skills/netbox-powerdns-integration/examples/01-vm-with-dns/`
- **Ansible Configuration:** See `.claude/skills/ansible-best-practices/examples/`

## Module Documentation

- **README:** [terraform-bgp-vm](https://github.com/basher83/Triangulum-Prime/tree/main/terraform-bgp-vm)
- **DEFAULTS:** [DEFAULTS.md](https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md)
- **Full API:** The module's variables.tf

## Philosophy: DRY (Don't Repeat Yourself)

This example follows the module's DRY principle:

✅ **Good:** Only specify what differs from the defaults

```hcl
vm_cpu = {
  cores = 4  # Only override cores; use the default type
}
```

❌ **Bad:** Repeating module defaults

```hcl
vm_cpu = {
  cores = 4
  type  = "host"  # This is already the default!
}
```

**Why?** It reduces maintenance burden and makes changes obvious.

138 skills/proxmox-infrastructure/examples/01-basic-vm/main.tf Normal file
@@ -0,0 +1,138 @@
# =============================================================================
# Basic VM Deployment Example
# =============================================================================
# This is a minimal example for learning the VM module. It shows only the
# required parameters, with sensible defaults for everything else.
#
# Use this as a starting point for understanding the module, then refer to
# the Triangulum-Prime examples for production-ready configurations.

terraform {
  required_version = ">= 1.0"

  required_providers {
    proxmox = {
      source  = "bpg/proxmox"
      version = "~> 0.69"
    }
  }
}

# Provider configuration (credentials from the environment)
provider "proxmox" {
  endpoint = var.proxmox_endpoint
  # Uses PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD from the environment
}

# =============================================================================
# Basic VM Module Usage
# =============================================================================

module "basic_vm" {
  source = "github.com/basher83/Triangulum-Prime//terraform-bgp-vm?ref=vm/1.0.1"

  # === REQUIRED: Basic Configuration ===
  vm_type  = "clone"           # Clone from an existing template
  pve_node = var.proxmox_node  # Which Proxmox node to deploy on
  vm_name  = var.vm_name       # Name of the VM

  # === REQUIRED: Clone Source ===
  # Specify which template to clone from
  src_clone = {
    datastore_id = "local-lvm"
    tpl_id       = 9000  # Your template VMID
  }

  # === REQUIRED: Disk Configuration ===
  # Define the VM's disk
  vm_disk = {
    scsi0 = {
      datastore_id = "local-lvm"
      size         = 20  # GB
      main_disk    = true
      # Note: file_format, iothread, ssd, discard use optimal defaults
    }
  }

  # === REQUIRED: Network Configuration ===
  # At minimum, configure one network interface
  vm_net_ifaces = {
    net0 = {
      bridge    = "vmbr0"
      ipv4_addr = "${var.ip_address}/24"
      ipv4_gw   = var.gateway
      # Note: model defaults to "virtio", vlan_id defaults to null
    }
  }

  # === REQUIRED: Cloud-init Configuration ===
  vm_init = {
    datastore_id = "local-lvm"

    user = {
      name = var.username
      keys = [var.ssh_public_key]
    }

    dns = {
      domain  = "spaceships.work"
      servers = ["192.168.3.1"]
    }
  }

  # === REQUIRED: EFI Disk (for UEFI boot) ===
  vm_efi_disk = {
    datastore_id = "local-lvm"
    # file_format defaults to "raw"
    # type defaults to "4m"
  }

  # === OPTIONAL OVERRIDES ===
  # These are shown here only for educational purposes.
  # The module already provides these defaults - you DON'T need to specify them!

  # CPU (defaults to 2 cores, "host" type)
  # vm_cpu = {
  #   cores = 2
  #   type  = "host"
  # }

  # Memory (defaults to 2048 MB / 2 GB)
  # vm_mem = {
  #   dedicated = 2048
  # }

  # Guest agent (defaults to enabled)
  # vm_agent = {
  #   enabled = true
  # }

  # VM start behavior (defaults: start on deploy, start on boot)
  # vm_start = {
  #   on_deploy = true
  #   on_boot   = true
  # }

  # === Learn More ===
  # See the module's DEFAULTS.md for the complete list of defaults:
  # https://github.com/basher83/Triangulum-Prime/blob/main/terraform-bgp-vm/DEFAULTS.md
}

# =============================================================================
# Outputs
# =============================================================================

output "vm_id" {
  description = "The ID of the created VM"
  value       = module.basic_vm.vm_id
}

output "vm_name" {
  description = "The name of the created VM"
  value       = module.basic_vm.vm_name
}

output "vm_ipv4_addresses" {
  description = "IPv4 addresses assigned to the VM"
  value       = module.basic_vm.ipv4_addresses
}

41 skills/proxmox-infrastructure/examples/01-basic-vm/variables.tf Normal file
@@ -0,0 +1,41 @@
variable "proxmox_endpoint" {
  description = "Proxmox API endpoint (e.g., https://192.168.3.5:8006)"
  type        = string
  default     = "https://192.168.3.5:8006"
}

variable "proxmox_node" {
  description = "Proxmox node to deploy on"
  type        = string
  default     = "foxtrot"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
  default     = "test-vm-01"
}

variable "ip_address" {
  description = "Static IP address for the VM (without CIDR)"
  type        = string
  default     = "192.168.3.100"
}

variable "gateway" {
  description = "Network gateway"
  type        = string
  default     = "192.168.3.1"
}

variable "username" {
  description = "VM username for cloud-init"
  type        = string
  default     = "ansible"
}

variable "ssh_public_key" {
  description = "SSH public key for VM access"
  type        = string
  # Set via an environment variable or a tfvars file
}

378 skills/proxmox-infrastructure/reference/api-reference.md Normal file
@@ -0,0 +1,378 @@
# Proxmox API Reference

## Overview

The Proxmox API enables programmatic management of the cluster via REST. This reference focuses on common patterns for Python (proxmoxer) and Terraform/Ansible usage.

## Authentication Methods

### API Tokens (Recommended)

**Create an API token via the CLI:**

```bash
pveum user token add <user>@<realm> <token-id> --privsep 0
```

**Environment variables:**

```bash
export PROXMOX_VE_API_TOKEN="user@realm!token-id=secret"
export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
```

### Password Authentication

```bash
export PROXMOX_VE_USERNAME="root@pam"
export PROXMOX_VE_PASSWORD="password"
export PROXMOX_VE_ENDPOINT="https://192.168.3.5:8006"
```

## Python API Usage (proxmoxer)

### Installation

```bash
# Using uv inline script metadata
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
```

### Basic Connection

```python
#!/usr/bin/env python3
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///

import os

from proxmoxer import ProxmoxAPI

# Connect using an API token (host is extracted from the endpoint URL)
proxmox = ProxmoxAPI(
    os.getenv("PROXMOX_VE_ENDPOINT").replace("https://", "").replace(":8006", ""),
    user=os.getenv("PROXMOX_VE_USERNAME"),
    token_name=os.getenv("PROXMOX_VE_TOKEN_NAME"),
    token_value=os.getenv("PROXMOX_VE_TOKEN_VALUE"),
    verify_ssl=False
)

# OR using a password
proxmox = ProxmoxAPI(
    '192.168.3.5',
    user='root@pam',
    password=os.getenv("PROXMOX_VE_PASSWORD"),
    verify_ssl=False
)
```

### Common Operations

**List VMs:**

```python
# Get all VMs across the cluster
for node in proxmox.nodes.get():
    node_name = node['node']
    for vm in proxmox.nodes(node_name).qemu.get():
        print(f"VM {vm['vmid']}: {vm['name']} on {node_name} - {vm['status']}")
```

**Get VM Configuration:**

```python
vmid = 101
node = "foxtrot"

vm_config = proxmox.nodes(node).qemu(vmid).config.get()
print(f"VM {vmid} config: {vm_config}")
```

**Clone Template:**

```python
import time

template_id = 9000
new_vmid = 101
node = "foxtrot"

# Clone the template; the API returns a task UPID immediately
upid = proxmox.nodes(node).qemu(template_id).clone.post(
    newid=new_vmid,
    name="docker-01-nexus",
    full=1,  # Full clone (not linked)
    storage="local-lvm"
)

# Cloning is asynchronous - poll the task until it finishes
while proxmox.nodes(node).tasks(upid).status.get()['status'] == 'running':
    time.sleep(2)
```

**Update VM Configuration:**

```python
# Set cloud-init parameters
proxmox.nodes(node).qemu(vmid).config.put(
    ipconfig0="ip=192.168.3.100/24,gw=192.168.3.1",
    nameserver="192.168.3.1",
    searchdomain="spaceships.work",
    sshkeys="ssh-rsa AAAA..."
)
```

**Start/Stop VM:**

```python
# Start VM
proxmox.nodes(node).qemu(vmid).status.start.post()

# Stop VM (graceful)
proxmox.nodes(node).qemu(vmid).status.shutdown.post()

# Force stop
proxmox.nodes(node).qemu(vmid).status.stop.post()
```

**Delete VM:**

```python
proxmox.nodes(node).qemu(vmid).delete()
```

### Cluster Operations

**Get Cluster Status:**

```python
cluster_status = proxmox.cluster.status.get()
for node in cluster_status:
    if node['type'] == 'node':
        print(f"Node: {node['name']} - online: {node['online']}")
```

**Get Node Resources:**

```python
node_status = proxmox.nodes(node).status.get()
print(f"CPU: {node_status['cpu']*100:.1f}%")
print(f"Memory: {node_status['memory']['used']/1024**3:.1f}GB / {node_status['memory']['total']/1024**3:.1f}GB")
```

### Storage Operations

**List Storage:**

```python
for storage in proxmox.storage.get():
    print(f"Storage: {storage['storage']} - Type: {storage['type']} - active: {storage['active']}")
```

**Get Storage Content:**

```python
storage = "local-lvm"
# Storage content is listed per node
content = proxmox.nodes(node).storage(storage).content.get()
for item in content:
    print(f"{item['volid']} - {item.get('vmid', 'N/A')} - {item['size']/1024**3:.1f}GB")
```

## Terraform Provider Patterns

### Basic Resource (VM from Clone)

```hcl
resource "proxmox_vm_qemu" "docker_host" {
  name        = "docker-01-nexus"
  target_node = "foxtrot"
  vmid        = 101

  clone      = "ubuntu-template"
  full_clone = true

  cores   = 4
  memory  = 8192
  sockets = 1

  network {
    bridge = "vmbr0"
    model  = "virtio"
    tag    = 30  # VLAN 30
  }

  disk {
    storage = "local-lvm"
    type    = "scsi"
    size    = "50G"
  }

  ipconfig0 = "ip=192.168.3.100/24,gw=192.168.3.1"

  sshkeys = file("~/.ssh/id_rsa.pub")
}
```

### Data Sources

```hcl
# Get template information
data "proxmox_vm_qemu" "template" {
  name        = "ubuntu-template"
  target_node = "foxtrot"
}

# Get storage information
data "proxmox_storage" "local_lvm" {
  node    = "foxtrot"
  storage = "local-lvm"
}
```

## Ansible Module Patterns

### Create VM from Template

```yaml
- name: Clone template to create VM
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    name: docker-01-nexus
    clone: ubuntu-template
    full: true
    storage: local-lvm
    net:
      net0: 'virtio,bridge=vmbr0,tag=30'
    ipconfig:
      ipconfig0: 'ip=192.168.3.100/24,gw=192.168.3.1'
    cores: 4
    memory: 8192
    agent: 1
    state: present
```

### Start VM

```yaml
- name: Start VM
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    state: started
```

## Matrix Cluster Specifics

### Node IP Addresses

```python
MATRIX_NODES = {
    "foxtrot": "192.168.3.5",
    "golf": "192.168.3.6",
    "hotel": "192.168.3.7"
}
```

### Storage Pools

```python
STORAGE_POOLS = {
    "local": "dir",          # Local directory
    "local-lvm": "lvmthin",  # LVM thin on the boot disk
    "ceph-pool": "rbd"       # CEPH RBD (when configured)
}
```

### Network Bridges

```python
BRIDGES = {
    "vmbr0": "192.168.3.0/24",  # Management + VLAN 9 (Corosync)
    "vmbr1": "192.168.5.0/24",  # CEPH Public (MTU 9000)
    "vmbr2": "192.168.7.0/24"   # CEPH Private (MTU 9000)
}
```

## Error Handling

### Python Example

```python
import sys

from proxmoxer import ProxmoxAPI, ResourceException

try:
    proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass', verify_ssl=False)
    vm_config = proxmox.nodes('foxtrot').qemu(101).config.get()
except ResourceException as e:
    print(f"API Error: {e}", file=sys.stderr)
    sys.exit(1)
except Exception as e:
    print(f"Unexpected error: {e}", file=sys.stderr)
    sys.exit(1)
```

### Ansible Example

```yaml
- name: Clone VM with error handling
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    # ... config ...
  register: clone_result
  failed_when: false

- name: Check clone result
  ansible.builtin.fail:
    msg: "Failed to clone VM: {{ clone_result.msg }}"
  when: clone_result.failed
```

## API Endpoints Reference

### Common Endpoints

```text
GET    /api2/json/nodes                                    # List nodes
GET    /api2/json/nodes/{node}/qemu                        # List VMs on a node
GET    /api2/json/nodes/{node}/qemu/{vmid}                 # Get VM status
POST   /api2/json/nodes/{node}/qemu/{vmid}/clone           # Clone VM
PUT    /api2/json/nodes/{node}/qemu/{vmid}/config          # Update config
POST   /api2/json/nodes/{node}/qemu/{vmid}/status/start    # Start VM
POST   /api2/json/nodes/{node}/qemu/{vmid}/status/shutdown # Stop VM
DELETE /api2/json/nodes/{node}/qemu/{vmid}                 # Delete VM

GET    /api2/json/cluster/status                           # Cluster status
GET    /api2/json/storage                                  # List storage
```
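
These endpoints can also be called directly with `curl`; a raw request with an API token looks roughly like this (the token ID and secret are placeholders):

```bash
# -k skips TLS verification (self-signed certs); use a proper CA in production
curl -k -H "Authorization: PVEAPIToken=root@pam!mytoken=<secret>" \
  "https://192.168.3.5:8006/api2/json/nodes"
```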

## Best Practices

1. **Use API tokens** - More secure than password authentication
2. **Handle SSL properly** - Use `verify_ssl=True` with a proper CA cert in production
3. **Check task completion** - Clone/migrate operations are async; poll for completion
4. **Error handling** - Always catch ResourceException and provide meaningful errors
5. **Rate limiting** - Don't hammer the API; add delays in loops
6. **Idempotency** - Check whether a resource exists before creating it (see the sketch below)
7. **Use VMID ranges** - Reserve ranges for different purposes (templates: 9000-9999, VMs: 100-999)
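
As a sketch of the idempotency point, a small existence check with proxmoxer before cloning (the helper name is illustrative):

```python
def vm_exists(proxmox, node: str, vmid: int) -> bool:
    """Return True if a VM or template with this VMID exists on the node."""
    return any(vm["vmid"] == vmid for vm in proxmox.nodes(node).qemu.get())

if not vm_exists(proxmox, "foxtrot", 101):
    proxmox.nodes("foxtrot").qemu(9000).clone.post(newid=101, name="docker-01-nexus")
```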

## Further Reading

- [Proxmox VE API Documentation](https://pve.proxmox.com/pve-docs/api-viewer/)
- [proxmoxer GitHub](https://github.com/proxmoxer/proxmoxer)
- [community.proxmox Collection](https://docs.ansible.com/ansible/latest/collections/community/proxmox/)

163 skills/proxmox-infrastructure/reference/cloud-init-patterns.md Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
# Cloud-Init Patterns for Proxmox VE
|
||||||
|
|
||||||
|
*Source: <https://pve.proxmox.com/wiki/Cloud-Init_Support*>
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Cloud-Init is the de facto multi-distribution package that handles early initialization of virtual machines. When a VM starts for the first time, Cloud-Init applies network and SSH key settings configured on the hypervisor.
|
||||||
|
|
||||||
|
## Template Creation Workflow
|
||||||
|
|
||||||
|
### Download and Import Cloud Image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download Ubuntu cloud image
|
||||||
|
wget https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img
|
||||||
|
|
||||||
|
# Create VM with VirtIO SCSI controller
|
||||||
|
qm create 9000 --memory 2048 --net0 virtio,bridge=vmbr0 --scsihw virtio-scsi-pci
|
||||||
|
|
||||||
|
# Import disk to storage
|
||||||
|
qm set 9000 --scsi0 local-lvm:0,import-from=/path/to/bionic-server-cloudimg-amd64.img
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important**: Ubuntu Cloud-Init images require `virtio-scsi-pci` controller type for SCSI drives.
|
||||||
|
|
||||||
|
### Configure Cloud-Init Components
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Add Cloud-Init CD-ROM drive
|
||||||
|
qm set 9000 --ide2 local-lvm:cloudinit
|
||||||
|
|
||||||
|
# Set boot order (speeds up boot)
|
||||||
|
qm set 9000 --boot order=scsi0
|
||||||
|
|
||||||
|
# Configure serial console (required for many cloud images)
|
||||||
|
qm set 9000 --serial0 socket --vga serial0
|
||||||
|
|
||||||
|
# Convert to template
|
||||||
|
qm template 9000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deploying from Templates
|
||||||
|
|
||||||
|
### Clone Template
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone template to new VM
|
||||||
|
qm clone 9000 123 --name ubuntu2
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configure VM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set SSH public key
|
||||||
|
qm set 123 --sshkey ~/.ssh/id_rsa.pub
|
||||||
|
|
||||||
|
# Configure network
|
||||||
|
qm set 123 --ipconfig0 ip=10.0.10.123/24,gw=10.0.10.1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Custom Cloud-Init Configuration
|
||||||
|
|
||||||
|
### Using Custom Config Files
|
||||||
|
|
||||||
|
Proxmox allows custom cloud-init configurations via the `cicustom` option:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
qm set 9000 --cicustom "user=<volume>,network=<volume>,meta=<volume>"
|
||||||
|
```
|
||||||
|
|
||||||
|
Example using local snippets storage:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
qm set 9000 --cicustom "user=local:snippets/userconfig.yaml"
|
||||||
|
```

### Dump Generated Config

Dump the automatically generated configs to use as a base for custom ones:

```bash
qm cloudinit dump 9000 user
qm cloudinit dump 9000 network
qm cloudinit dump 9000 meta
```
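The same dump is exposed over the API (`GET /nodes/{node}/qemu/{vmid}/cloudinit/dump`); a short sketch, assuming proxmoxer returns the dumped config as a plain string:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Fetch the generated user-data for VM 9000 and save it as a snippet base
userdata = proxmox.nodes('foxtrot').qemu(9000).cloudinit.dump.get(type='user')
with open('userconfig.yaml', 'w') as f:
    f.write(userdata)
```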

## Cloud-Init Options Reference

### cicustom

Specify custom files to replace the automatically generated ones:

- `meta=<volume>` - Meta data (provider specific)
- `network=<volume>` - Network data
- `user=<volume>` - User data
- `vendor=<volume>` - Vendor data

### cipassword

Password for the user. **Not recommended** - use SSH keys instead.

### citype

Configuration format: `configdrive2 | nocloud | opennebula`

- Default: `nocloud` for Linux, `configdrive2` for Windows

### ciupgrade

Automatic package upgrade after first boot (default: `true`)

### ciuser

Username to configure (instead of the image's default user)

### ipconfig[n]

IP addresses and gateways for network interfaces.

Format: `[gw=<GatewayIPv4>] [,gw6=<GatewayIPv6>] [,ip=<IPv4Format/CIDR>] [,ip6=<IPv6Format/CIDR>]`

Special values:

- `ip=dhcp` - Use DHCP for IPv4
- `ip6=auto` - Use stateless autoconfiguration (requires cloud-init 19.4+)

### sshkeys

Public SSH keys (one per line, OpenSSH format)

### nameserver

DNS server IP address

### searchdomain

DNS search domains
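Several of these options can be set in one API call; a sketch against the cloned VM 123 from the earlier example (values are illustrative):

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# ciuser, nameserver, searchdomain, and ipconfig0 map 1:1 to config keys
proxmox.nodes('foxtrot').qemu(123).config.put(
    ciuser='deploy',
    nameserver='10.0.10.1',
    searchdomain='example.internal',
    ipconfig0='ip=dhcp',
)
```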
## Best Practices

1. **Use SSH keys** instead of passwords for authentication
2. **Configure serial console** for cloud images (many require it)
3. **Set boot order** to speed up the boot process
4. **Convert to template** for fast linked-clone deployment
5. **Store custom configs in snippets storage** (must be available on all nodes for migration)
6. **Test with a clone** before modifying a template

## Troubleshooting

### Template Won't Boot

- Check whether the serial console is configured: `qm set <vmid> --serial0 socket --vga serial0`
- Verify the boot order: `qm set <vmid> --boot order=scsi0`

### Network Not Configured

- Ensure the cloud-init CD-ROM is attached: `qm set <vmid> --ide2 local-lvm:cloudinit`
- Check the IP configuration: `qm config <vmid> | grep ipconfig`

### SSH Keys Not Working

- Verify the `sshkeys` format (OpenSSH format, one per line)
- Check the cloud-init logs inside the VM: `cat /var/log/cloud-init.log`
373
skills/proxmox-infrastructure/reference/networking.md
Normal file
@@ -0,0 +1,373 @@

# Proxmox Network Configuration

*Source: <https://pve.proxmox.com/wiki/Network_Configuration>*

## Key Concepts

### Configuration File

All network configuration is in `/etc/network/interfaces`. GUI changes are written to `/etc/network/interfaces.new` for safety and only take effect once applied.

### Applying Changes

**ifupdown2 (recommended):**

```bash
# Apply from the GUI or run:
ifreload -a
```

**Reboot method:**

On reboot, the `pvenetcommit` service activates the staging file before the `networking` service applies it.
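Interfaces can also be staged and applied over the API; a sketch assuming the `POST /nodes/{node}/network` (stage) and `PUT /nodes/{node}/network` (apply) endpoints of current releases, and a spare NIC named `eno2`:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
net = proxmox.nodes('foxtrot').network

# Stage a VLAN-aware bridge (written to /etc/network/interfaces.new)
net.post(
    iface='vmbr1',
    type='bridge',
    autostart=1,
    bridge_ports='eno2',
    bridge_vlan_aware=1,
)

# Apply staged changes (same as "Apply Configuration" / ifreload -a)
net.put()
```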

## Naming Conventions

### Current (Proxmox VE 5.0+)

- Ethernet: `en*` (systemd predictable names)
  - `eno1` - first on-board NIC
  - `enp3s0f1` - function 1 of the NIC on PCI bus 3, slot 0
- Bridges: `vmbr[0-4094]`
- Bonds: `bond[N]`
- VLANs: add the VLAN number after a period: `eno1.50`, `bond1.30`

### Legacy (pre-5.0)

- Ethernet: `eth[N]` (eth0, eth1, ...)

### Pinning Naming Scheme Version

Add to the kernel command line to prevent name changes:

```bash
net.naming-scheme=v252
```

### Overriding Device Names

**Automatic tool:**

```bash
# Generate .link files for all interfaces
pve-network-interface-pinning generate

# With a custom prefix
pve-network-interface-pinning generate --prefix myprefix

# Pin a specific interface
pve-network-interface-pinning generate --interface enp1s0 --target-name if42
```

**Manual method** (`/etc/systemd/network/10-enwan0.link`):

```ini
[Match]
MACAddress=aa:bb:cc:dd:ee:ff
Type=ether

[Link]
Name=enwan0
```

After creating link files:

```bash
update-initramfs -u -k all
# Then reboot
```

## Network Setups

### Default Bridged Configuration

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual

auto vmbr0
iface vmbr0 inet static
    address 192.168.10.2/24
    gateway 192.168.10.1
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
```

VMs behave as if they were directly connected to the physical network.

### Routed Configuration

For hosting providers that block multiple MAC addresses per port:

```bash
auto lo
iface lo inet loopback

auto eno0
iface eno0 inet static
    address 198.51.100.5/29
    gateway 198.51.100.1
    post-up echo 1 > /proc/sys/net/ipv4/ip_forward
    post-up echo 1 > /proc/sys/net/ipv4/conf/eno0/proxy_arp

auto vmbr0
iface vmbr0 inet static
    address 203.0.113.17/28
    bridge-ports none
    bridge-stp off
    bridge-fd 0
```

### Masquerading (NAT)

For VMs with private IPs:

```bash
auto lo
iface lo inet loopback

auto eno1
iface eno1 inet static
    address 198.51.100.5/24
    gateway 198.51.100.1

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.1/24
    bridge-ports none
    bridge-stp off
    bridge-fd 0
    post-up echo 1 > /proc/sys/net/ipv4/ip_forward
    post-up iptables -t nat -A POSTROUTING -s '10.10.10.0/24' -o eno1 -j MASQUERADE
    post-down iptables -t nat -D POSTROUTING -s '10.10.10.0/24' -o eno1 -j MASQUERADE
```

**Conntrack zones fix** (if the firewall blocks outgoing traffic):

```bash
post-up iptables -t raw -I PREROUTING -i fwbr+ -j CT --zone 1
post-down iptables -t raw -D PREROUTING -i fwbr+ -j CT --zone 1
```

## Linux Bonding

### Bond Modes

1. **balance-rr** - Round-robin (load balancing + fault tolerance)
2. **active-backup** - Only one active NIC (fault tolerance only)
3. **balance-xor** - XOR selection (load balancing + fault tolerance)
4. **broadcast** - Transmit on all slaves (fault tolerance)
5. **802.3ad (LACP)** - IEEE 802.3ad dynamic link aggregation (requires switch support)
6. **balance-tlb** - Adaptive transmit load balancing
7. **balance-alb** - Adaptive load balancing (balance-tlb + receive balancing)

**Recommendation:**

- If the switch supports LACP → use 802.3ad
- Otherwise → use active-backup
### Bond with Fixed IP

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet static
    bond-slaves eno1 eno2
    address 192.168.1.2/24
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports eno3
    bridge-stp off
    bridge-fd 0
```

### Bond as Bridge Port

For a fault-tolerant guest network:

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet manual
    bond-slaves eno1 eno2
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports bond0
    bridge-stp off
    bridge-fd 0
```

## VLAN Configuration (802.1Q)

### VLAN Awareness on Bridge

**Guest VLANs** - Configure the VLAN tag in the VM settings; the bridge handles tagging transparently.

**Bridge with VLAN awareness:**

```bash
auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```

### Host Management on VLAN

**With VLAN-aware bridge:**

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual

auto vmbr0.5
iface vmbr0.5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1

auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094
```

**Traditional VLAN:**

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno1.5 inet manual

auto vmbr0v5
iface vmbr0v5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports eno1.5
    bridge-stp off
    bridge-fd 0

auto vmbr0
iface vmbr0 inet manual
    bridge-ports eno1
    bridge-stp off
    bridge-fd 0
```

### VLAN with Bonding

```bash
auto lo
iface lo inet loopback

iface eno1 inet manual
iface eno2 inet manual

auto bond0
iface bond0 inet manual
    bond-slaves eno1 eno2
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3

iface bond0.5 inet manual

auto vmbr0v5
iface vmbr0v5 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports bond0.5
    bridge-stp off
    bridge-fd 0

auto vmbr0
iface vmbr0 inet manual
    bridge-ports bond0
    bridge-stp off
    bridge-fd 0
```

## Advanced Features

### Disable MAC Learning

Available since Proxmox VE 7.3:

```bash
auto vmbr0
iface vmbr0 inet static
    address 10.10.10.2/24
    gateway 10.10.10.1
    bridge-ports ens18
    bridge-stp off
    bridge-fd 0
    bridge-disable-mac-learning 1
```

With MAC learning disabled, Proxmox VE adds the VM/CT MAC addresses to the forwarding database itself.

### Disable IPv6

Create `/etc/sysctl.d/disable-ipv6.conf`:

```ini
net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1
```

Then apply it: `sysctl -p /etc/sysctl.d/disable-ipv6.conf`
## Troubleshooting

### Avoid ifup/ifdown

**Don't use** `ifup`/`ifdown` on bridges: they interrupt guest traffic without reconnecting it.

**Use instead:**

- The GUI "Apply Configuration" button
- The `ifreload -a` command
- A reboot

### Network Changes Not Applied

1. Check that `/etc/network/interfaces.new` exists
2. Click "Apply Configuration" in the GUI or run `ifreload -a`
3. If issues persist, reboot

### Bond Not Working with Corosync

Some bond modes are problematic for Corosync. Use multiple independent networks (Corosync links) instead of bonding for cluster traffic.
467
skills/proxmox-infrastructure/reference/qemu-guest-agent.md
Normal file
@@ -0,0 +1,467 @@

# QEMU Guest Agent Integration

## Overview

The QEMU Guest Agent (`qemu-guest-agent`) is a service running inside VMs that enables communication between Proxmox and the guest OS. It provides IP address detection, graceful shutdowns, filesystem freezing for snapshots, and more.

## Why Use QEMU Guest Agent?

**Without Guest Agent:**

- VM IP address unknown to Proxmox
- Shutdown = hard power off
- Snapshots don't freeze the filesystem (risk of inconsistent data)
- No guest-level monitoring

**With Guest Agent:**

- Automatic IP address detection
- Graceful shutdown/reboot
- Consistent snapshots with filesystem freeze
- Execute commands inside the VM
- Query guest information (hostname, users, OS details)
## Installation in Guest VM

### Ubuntu/Debian

```bash
sudo apt update
sudo apt install qemu-guest-agent
sudo systemctl enable qemu-guest-agent
sudo systemctl start qemu-guest-agent
```

### RHEL/Rocky/AlmaLinux

```bash
sudo dnf install qemu-guest-agent
sudo systemctl enable qemu-guest-agent
sudo systemctl start qemu-guest-agent
```

### Verify Installation

```bash
systemctl status qemu-guest-agent
```

**Expected output:**

```text
● qemu-guest-agent.service - QEMU Guest Agent
     Loaded: loaded (/lib/systemd/system/qemu-guest-agent.service; enabled)
     Active: active (running)
```

## Enable in VM Configuration

### Via Proxmox Web UI

**VM → Hardware → Add → QEMU Guest Agent**

OR edit the VM options:

**VM → Options → QEMU Guest Agent → Edit → Check "Use QEMU Guest Agent"**

### Via CLI

```bash
qm set <vmid> --agent 1
```

**With custom options:**

```bash
# Enable, and run fstrim on disks after cloning
qm set <vmid> --agent enabled=1,fstrim_cloned_disks=1
```

### Via Terraform

```hcl
resource "proxmox_vm_qemu" "vm" {
  name = "my-vm"
  # ... other config ...

  agent = 1 # Enable guest agent
}
```

### Via Ansible

```yaml
- name: Enable QEMU guest agent
  community.proxmox.proxmox_kvm:
    api_host: "{{ proxmox_api_host }}"
    api_user: "{{ proxmox_api_user }}"
    api_token_id: "{{ proxmox_token_id }}"
    api_token_secret: "{{ proxmox_token_secret }}"
    node: foxtrot
    vmid: 101
    agent: 1
    update: true
```

## Using Guest Agent

### Check Agent Status

**Via CLI:**

```bash
# Test if the agent is responding
qm agent 101 ping

# Get guest info
qm agent 101 info

# Get network interfaces (including IP addresses)
qm agent 101 network-get-interfaces

# Get OS details
qm agent 101 get-osinfo
```

**Example output:**

```json
{
  "result": {
    "id": "ubuntu",
    "kernel-release": "5.15.0-91-generic",
    "kernel-version": "#101-Ubuntu SMP",
    "machine": "x86_64",
    "name": "Ubuntu",
    "pretty-name": "Ubuntu 22.04.3 LTS",
    "version": "22.04",
    "version-id": "22.04"
  }
}
```

### Execute Commands

**Via CLI:**

```bash
# Execute command in guest
qm guest exec 101 -- whoami

# With arguments
qm guest exec 101 -- ls -la /tmp
```

**Via Python API:**

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Execute command
result = proxmox.nodes('foxtrot').qemu(101).agent.exec.post(
    command=['whoami']
)

# Get execution result
pid = result['pid']
exec_status = proxmox.nodes('foxtrot').qemu(101).agent('exec-status').get(pid=pid)
print(exec_status)
```

### Graceful Shutdown/Reboot

**Shutdown (graceful with agent):**

```bash
# Sends an ACPI shutdown to the guest and waits for the OS to shut down
qm shutdown 101

# Force stop if shutdown doesn't complete within 60s
qm shutdown 101 --timeout 60 --forceStop 1
```

**Reboot:**

```bash
qm reboot 101
```
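The equivalent over the API uses the `status/shutdown` endpoint, which takes the same timeout/force semantics; a minimal sketch:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Graceful shutdown; hard-stop if the guest hasn't halted after 60s
proxmox.nodes('foxtrot').qemu(101).status.shutdown.post(timeout=60, forceStop=1)
```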

## Snapshot Integration

### Filesystem Freeze for Consistent Snapshots

When the guest agent is enabled, Proxmox can freeze the filesystem before taking a snapshot, ensuring consistency.

**Create snapshot with FS freeze:**

```bash
# The guest agent automatically freezes the filesystem
qm snapshot 101 before-upgrade --vmstate 0 --description "Before upgrade"
```

**Rollback to snapshot:**

```bash
qm rollback 101 before-upgrade
```

**Delete snapshot:**

```bash
qm delsnapshot 101 before-upgrade
```
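The snapshot lifecycle maps directly onto the API (`snapshot`, `snapshot/{name}/rollback`, and DELETE on the snapshot path); a sketch mirroring the CLI commands above:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
vm = proxmox.nodes('foxtrot').qemu(101)

# Create a snapshot (the agent freezes the filesystem when enabled)
vm.snapshot.post(snapname='before-upgrade', vmstate=0,
                 description='Before upgrade')

# Roll back, then delete the snapshot once it is no longer needed
vm.snapshot('before-upgrade').rollback.post()
vm.snapshot('before-upgrade').delete()
```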

## IP Address Detection

### Automatic IP Assignment

With the guest agent, Proxmox automatically detects VM IP addresses.

**View in Web UI:**

VM → Summary → the IPs section shows detected addresses

**Via CLI:**

```bash
qm agent 101 network-get-interfaces | jq '.result[] | select(.name=="eth0") | ."ip-addresses"'
```

**Via Python:**

```python
interfaces = proxmox.nodes('foxtrot').qemu(101).agent('network-get-interfaces').get()

for iface in interfaces['result']:
    if iface['name'] == 'eth0':
        for ip in iface.get('ip-addresses', []):
            if ip['ip-address-type'] == 'ipv4':
                print(f"IPv4: {ip['ip-address']}")
```

## Advanced Configuration

### Guest Agent Options

**Full options syntax:**

```bash
qm set <vmid> --agent [enabled=]<1|0>[,fstrim_cloned_disks=<1|0>][,type=<virtio|isa>]
```

**Parameters:**

- `enabled` - Enable/disable the guest agent (default: 0)
- `fstrim_cloned_disks` - Run fstrim after cloning a disk (default: 0)
- `type` - Agent communication type: virtio or isa (default: virtio)

**Example:**

```bash
# Enable with fstrim on cloned disks
qm set 101 --agent enabled=1,fstrim_cloned_disks=1
```

### Filesystem Trim (fstrim)

For VMs on thin-provisioned storage (LVM-thin, CEPH), fstrim helps reclaim unused space.

**Manual fstrim:**

```bash
# Inside VM
sudo fstrim -av
```

**Automatic on clone:**

```bash
qm set <vmid> --agent enabled=1,fstrim_cloned_disks=1
```

**Scheduled fstrim (inside VM):**

```bash
# Enable the weekly fstrim timer
sudo systemctl enable fstrim.timer
sudo systemctl start fstrim.timer
```

## Cloud-Init Integration

### Include in Cloud-Init Template

**During template creation:**

```bash
# Install the agent package into the image
virt-customize -a ubuntu-22.04.img \
    --install qemu-guest-agent \
    --run-command "systemctl enable qemu-guest-agent"

# Create VM from image
qm create 9000 --name ubuntu-template --memory 2048 --cores 2 --net0 virtio,bridge=vmbr0
qm importdisk 9000 ubuntu-22.04.img local-lvm
qm set 9000 --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-9000-disk-0
qm set 9000 --agent 1 # Enable guest agent
qm set 9000 --ide2 local-lvm:cloudinit
qm template 9000
```

### Cloud-Init User Data

**Include in the cloud-init config:**

```yaml
#cloud-config
packages:
  - qemu-guest-agent

runcmd:
  - systemctl enable qemu-guest-agent
  - systemctl start qemu-guest-agent
```

## Troubleshooting

### Guest Agent Not Responding

**1. Check if the service is running in the guest:**

```bash
# Inside VM
systemctl status qemu-guest-agent
journalctl -u qemu-guest-agent
```

**2. Check if the agent is enabled in the VM config:**

```bash
# On Proxmox host
qm config 101 | grep agent
```

**3. Check the virtio serial device:**

```bash
# Inside VM
ls -l /dev/virtio-ports/
# Should show: org.qemu.guest_agent.0
```

**4. Restart the agent:**

```bash
# Inside VM
sudo systemctl restart qemu-guest-agent
```

**5. Check that Proxmox can communicate:**

```bash
# On Proxmox host
qm agent 101 ping
```
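For scripted checks, the same ping is available via the API and raises on failure; a sketch assuming `ResourceException` from `proxmoxer.core`:

```python
from proxmoxer import ProxmoxAPI
from proxmoxer.core import ResourceException

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

try:
    proxmox.nodes('foxtrot').qemu(101).agent.ping.post()
    print("Guest agent is responding")
except ResourceException as exc:
    # PVE returns an error when the agent is disabled or not running
    print(f"Guest agent not responding: {exc}")
```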
### IP Address Not Detected

**Possible causes:**

1. Guest agent not running
2. Network interface not configured
3. DHCP not assigning an IP
4. Firewall blocking communication

**Debug:**

```bash
# On the Proxmox host: check all interfaces reported by the agent
qm agent 101 network-get-interfaces | jq

# Inside the VM: verify cloud-init completed
cloud-init status
```
### Filesystem Freeze Timeout

**Symptoms:**

Snapshot or backup creation hangs or times out during the filesystem freeze.

**Solution:**

Look inside the guest for whatever is blocking the freeze (busy databases, hung mounts):

```bash
# Inside VM
journalctl -u qemu-guest-agent
```

On recent Proxmox VE releases the freeze can also be skipped during backups:

```bash
qm set 101 --agent enabled=1,freeze-fs-on-backup=0

# Snapshots without saved RAM state are still possible while debugging
qm snapshot 101 test --vmstate 0
```

### Agent Installed but Not Enabled

**Check the VM config:**

```bash
qm config 101 | grep agent
```

**If missing, enable it:**

```bash
qm set 101 --agent 1
```

**Restart the VM for the change to take effect:**

```bash
qm reboot 101
```

## Best Practices

1. **Always install in templates** - Include qemu-guest-agent in VM templates
2. **Enable during provisioning** - Set `--agent 1` when creating VMs
3. **Use for production VMs** - Critical for graceful shutdowns and monitoring
4. **Enable fstrim for thin storage** - Helps reclaim space on LVM-thin and CEPH
5. **Test before snapshots** - Verify the agent works: `qm agent <vmid> ping`
6. **Cloud-init integration** - Automate installation via cloud-init packages
7. **Monitor agent status** - Check that the agent is running in your monitoring tools
## Ansible Automation Example

```yaml
---
- name: Ensure QEMU guest agent is configured
  hosts: proxmox_vms
  become: true
  tasks:
    - name: Install qemu-guest-agent
      ansible.builtin.apt:
        name: qemu-guest-agent
        state: present
      when: ansible_os_family == "Debian"

    - name: Enable and start qemu-guest-agent
      ansible.builtin.systemd:
        name: qemu-guest-agent
        enabled: true
        state: started

    - name: Verify agent is running
      ansible.builtin.systemd:
        name: qemu-guest-agent
      register: agent_status

    - name: Report agent status
      ansible.builtin.debug:
        msg: "Guest agent is {{ agent_status.status.ActiveState }}"
```

## Further Reading

- [Proxmox QEMU Guest Agent Documentation](https://pve.proxmox.com/wiki/Qemu-guest-agent)
- [QEMU Guest Agent Protocol](https://www.qemu.org/docs/master/interop/qemu-ga.html)
486
skills/proxmox-infrastructure/reference/storage-management.md
Normal file
@@ -0,0 +1,486 @@

# Proxmox Storage Management

## Overview

Proxmox VE supports multiple storage backends. This guide focuses on the storage architecture of the Matrix cluster: LVM-thin for boot disks and CEPH for distributed storage.

## Matrix Cluster Storage Architecture

### Hardware Configuration

**Per Node (Foxtrot, Golf, Hotel):**

```text
nvme0n1 - 1TB Crucial P3       → Boot disk + LVM
nvme1n1 - 4TB Samsung 990 PRO  → CEPH OSD (2 OSDs)
nvme2n1 - 4TB Samsung 990 PRO  → CEPH OSD (2 OSDs)
```

**Total Cluster:**

- 3× 1TB boot disks (LVM local storage)
- 6× 4TB NVMe drives (24TB raw CEPH capacity)
- 12 CEPH OSDs total (2 per NVMe drive)

### Storage Pools

```text
Storage Pool   Type      Backend     Purpose
------------   -------   ---------   -------
local          dir       Directory   ISO images, templates, backups
local-lvm      lvmthin   LVM-thin    VM disks (local)
ceph-pool      rbd       CEPH RBD    VM disks (distributed, HA)
ceph-fs        cephfs    CephFS      Shared filesystem
```

## LVM Storage

### LVM-thin Configuration

**Advantages:**

- Thin provisioning (overcommit storage)
- Fast snapshots
- Local to each node (low latency)
- No network overhead

**Disadvantages:**

- No HA (tied to a single node)
- Live migration requires copying disks to the target node
- Limited to the node's local disk size

**Check LVM usage:**

```bash
# View volume groups
vgs

# View logical volumes
lvs

# View thin pool usage
lvs -a | grep thin
```

**Example output:**

```text
LV            VG  Attr       LSize   Pool Origin Data%
data          pve twi-aotz-- 850.00g             45.23
vm-101-disk-0 pve Vwi-aotz--  50.00g data        12.45
```

### Managing LVM Storage

**Extend the thin pool (if the boot disk has free space):**

```bash
# Check free space in the VG
vgs pve

# Extend the thin pool
lvextend -L +100G pve/data
```

**Create a VM disk manually:**

```bash
# Create a 50GB thin volume for VM 101
lvcreate -V 50G -T pve/data -n vm-101-disk-0
```

## CEPH Storage

### CEPH Architecture for Matrix

**Network Configuration:**

```text
vmbr1 (192.168.5.0/24, MTU 9000) → CEPH Public Network
vmbr2 (192.168.7.0/24, MTU 9000) → CEPH Private Network
```

**OSD Distribution:**

```text
Node      NVMe      OSDs   Capacity
-------   -------   ----   --------
foxtrot   nvme1n1   2      4TB
foxtrot   nvme2n1   2      4TB
golf      nvme1n1   2      4TB
golf      nvme2n1   2      4TB
hotel     nvme1n1   2      4TB
hotel     nvme2n1   2      4TB
-------   -------   ----   --------
Total               12     24TB raw
```

**Usable capacity (replica 3):** ~8TB. With 3 replicas every byte is stored three times, so 24TB raw / 3 = 8TB usable (less in practice, since roughly 20% should stay free for rebalancing).
### CEPH Deployment Commands

**Install CEPH:**

```bash
# On the first node (foxtrot)
pveceph install --version reef

# Initialize the cluster
pveceph init --network 192.168.5.0/24 --cluster-network 192.168.7.0/24
```

**Create Monitors (3 for quorum):**

```bash
# On each node
pveceph mon create
```

**Create Manager (on each node):**

```bash
pveceph mgr create
```

**Create OSDs:**

```bash
# On each node - one command per NVMe drive

# For nvme1n1 (4TB)
pveceph osd create /dev/nvme1n1 --crush-device-class nvme

# For nvme2n1 (4TB)
pveceph osd create /dev/nvme2n1 --crush-device-class nvme
```

Note: `pveceph osd create` provisions one OSD per device; splitting a drive into the two OSDs described above is typically done with `ceph-volume lvm batch --osds-per-device 2` instead.

**Create CEPH Pool:**

```bash
# Create an RBD pool for VMs
pveceph pool create ceph-pool --add_storages

# Create CephFS for shared storage
pveceph fs create --name cephfs --add-storage
```

### CEPH Configuration Best Practices

**Optimize for NVMe** (`/etc/pve/ceph.conf`):

```ini
[global]
public_network = 192.168.5.0/24
cluster_network = 192.168.7.0/24
osd_pool_default_size = 3
osd_pool_default_min_size = 2

[osd]
osd_memory_target = 4294967296  # 4GB per OSD
osd_max_backfills = 1
osd_recovery_max_active = 1
```

**Restart the CEPH OSD services after a config change:**

```bash
systemctl restart ceph-osd.target
```

### CEPH Monitoring

**Check cluster health:**

```bash
ceph status
ceph health detail
```

**Example healthy output:**

```text
cluster:
  id:     a1b2c3d4-e5f6-7890-abcd-ef1234567890
  health: HEALTH_OK

services:
  mon: 3 daemons, quorum foxtrot,golf,hotel
  mgr: foxtrot(active), standbys: golf, hotel
  osd: 12 osds: 12 up, 12 in

data:
  pools:   2 pools, 128 pgs
  objects: 1.23k objects, 45 GiB
  usage:   135 GiB used, 23.8 TiB / 24 TiB avail
  pgs:     128 active+clean
```

**Check OSD performance:**

```bash
ceph osd df
ceph osd perf
```

**Check pool usage:**

```bash
ceph df
rados df
```
## Storage Configuration in Proxmox

### Add Storage via Web UI

**Datacenter → Storage → Add:**

1. **Directory** - For ISOs and backups
2. **LVM-Thin** - For local VM disks
3. **RBD** - For CEPH VM disks
4. **CephFS** - For shared files

### Add Storage via CLI

**CEPH RBD:**

```bash
pvesm add rbd ceph-pool \
    --pool ceph-pool \
    --content images,rootdir \
    --nodes foxtrot,golf,hotel
```

**CephFS:**

```bash
pvesm add cephfs cephfs \
    --path /mnt/pve/cephfs \
    --content backup,iso,vztmpl \
    --nodes foxtrot,golf,hotel
```

**NFS (if using an external NAS):**

```bash
pvesm add nfs nas-storage \
    --server 192.168.3.10 \
    --export /mnt/tank/proxmox \
    --content images,backup,iso \
    --nodes foxtrot,golf,hotel
```

## VM Disk Management

### Create VM Disk on CEPH

**Via CLI:**

```bash
# Create a 100GB disk for VM 101 on CEPH
qm set 101 --scsi1 ceph-pool:100
```

**Via API (Python):**

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')
proxmox.nodes('foxtrot').qemu(101).config.put(scsi1='ceph-pool:100')
```

### Move VM Disk Between Storage

**Move from local-lvm to CEPH:**

```bash
qm move-disk 101 scsi0 ceph-pool --delete 1
```

The same command works while the VM is running: `qm move-disk` mirrors the disk to the target storage online, so no separate flag is needed.
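The move is also exposed as the `move_disk` API call; a sketch with the same parameters as the CLI command (`disk`, target `storage`, `delete`):

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Move scsi0 of VM 101 to ceph-pool, deleting the source volume afterwards
proxmox.nodes('foxtrot').qemu(101).move_disk.post(
    disk='scsi0',
    storage='ceph-pool',
    delete=1,
)
```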

### Resize VM Disk

**Grow a disk (shrinking is not supported):**

```bash
# Grow VM 101's scsi0 by 50GB
qm resize 101 scsi0 +50G
```

**Inside the VM (expand the filesystem):**

```bash
# For ext4
sudo resize2fs /dev/sda1

# For XFS
sudo xfs_growfs /
```
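Via the API, growing a disk is a `PUT` on the `resize` endpoint with the same `+size` syntax:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Grow VM 101's scsi0 by 50GB (shrinking is not supported)
proxmox.nodes('foxtrot').qemu(101).resize.put(disk='scsi0', size='+50G')
```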

## Backup and Restore

### Backup to Storage

**Create a backup:**

```bash
# Backup VM 101 to local storage
vzdump 101 --storage local --mode snapshot --compress zstd

# Backup to CephFS
vzdump 101 --storage cephfs --mode snapshot --compress zstd
```
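Backups can also be triggered programmatically through the node's `vzdump` endpoint, which takes the same options as the CLI; a minimal sketch:

```python
from proxmoxer import ProxmoxAPI

proxmox = ProxmoxAPI('192.168.3.5', user='root@pam', password='pass')

# Snapshot-mode backup of VM 101 to CephFS; returns a task UPID
upid = proxmox.nodes('foxtrot').vzdump.post(
    vmid=101,
    storage='cephfs',
    mode='snapshot',
    compress='zstd',
)
print(f"Backup task started: {upid}")
```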

**Scheduled backups (via Web UI):**

Datacenter → Backup → Add:

- Schedule: Daily at 2 AM
- Storage: cephfs
- Mode: Snapshot
- Compression: ZSTD
- Retention: Keep last 7
### Restore from Backup

**List backups:**

```bash
ls /var/lib/vz/dump/
# OR
ls /mnt/pve/cephfs/dump/
```

**Restore:**

```bash
# Restore to the same VMID
qmrestore /var/lib/vz/dump/vzdump-qemu-101-2024_01_15-02_00_00.vma.zst 101

# Restore to a new VMID
qmrestore /var/lib/vz/dump/vzdump-qemu-101-2024_01_15-02_00_00.vma.zst 102 --storage ceph-pool
```

## Performance Tuning

### CEPH Performance

**Set the proper device class on NVMe OSDs:**

```bash
ceph osd crush set-device-class nvme osd.0
ceph osd crush set-device-class nvme osd.1
# ... repeat for all OSDs
```

**Create a performance pool:**

```bash
ceph osd pool create fast-pool 128 128
ceph osd pool application enable fast-pool rbd
```

**Enable RBD cache** (`/etc/pve/ceph.conf`):

```ini
[client]
rbd_cache = true
rbd_cache_size = 134217728  # 128MB
rbd_cache_writethrough_until_flush = false
```

### LVM Performance

**Use SSD emulation and discard:**

```bash
# Enable discard on the VM disk
qm set 101 --scsi0 local-lvm:vm-101-disk-0,discard=on,ssd=1
```
## Troubleshooting

### CEPH Not Healthy

**Check OSD status:**

```bash
ceph osd tree
ceph osd stat
```

**Restart a stuck OSD:**

```bash
systemctl restart ceph-osd@0.service
```

**Check network connectivity:**

```bash
# From one node to another; 8972 = 9000 minus 28 bytes of ICMP/IP headers
ping -c 3 -M do -s 8972 192.168.5.6 # Test MTU 9000
```

### LVM Out of Space

**Check thin pool usage:**

```bash
lvs pve/data -o lv_name,data_percent,metadata_percent
```

**If the thin pool is > 90% full:**

```bash
# Extend if the VG has space
lvextend -L +100G pve/data

# OR delete unused VM disks
lvremove pve/vm-XXX-disk-0
```

### Storage Performance Issues

**Test disk I/O:**

```bash
# Test sequential write
dd if=/dev/zero of=/tmp/test bs=1M count=1024 oflag=direct

# Test CEPH RBD performance
rbd bench --io-type write ceph-pool/test-image
```

**Monitor CEPH latency:**

```bash
ceph osd perf
```

## Best Practices

1. **Use CEPH for HA VMs** - Store critical VM disks on CEPH for live migration
2. **Use LVM for performance** - Non-critical VMs get better performance on local LVM
3. **MTU 9000 for CEPH** - Always use jumbo frames on CEPH networks
4. **Separate networks** - Keep the public and private CEPH networks on different interfaces
5. **Monitor CEPH health** - Set up alerts for HEALTH_WARN/HEALTH_ERR
6. **Regular backups** - Automated daily backups to CephFS or an external NAS
7. **Plan for growth** - Leave 20% free space in CEPH for rebalancing
8. **Use replica 3** - Essential for data safety, especially with only 3 nodes

## Further Reading

- [Proxmox VE Storage Documentation](https://pve.proxmox.com/wiki/Storage)
- [CEPH Documentation](https://docs.ceph.com/)
- [Proxmox CEPH Guide](https://pve.proxmox.com/wiki/Deploy_Hyper-Converged_Ceph_Cluster)
469
skills/proxmox-infrastructure/tools/check_ceph_health.py
Executable file
@@ -0,0 +1,469 @@

#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
CEPH Cluster Health Checker

Validates CEPH storage cluster health including:
- Cluster health status
- Monitor and manager status
- OSD status and distribution
- Pool configuration and usage
- PG state verification

Usage:
    python check_ceph_health.py [--node NODE] [--json]

Examples:
    # Check CEPH health (requires SSH access to cluster node)
    python check_ceph_health.py --node foxtrot

    # Output as JSON for parsing
    python check_ceph_health.py --node foxtrot --json

    # Check minimum OSD count
    python check_ceph_health.py --node foxtrot --min-osds 12
"""

import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional


@dataclass
class OSDStatus:
    """OSD status information"""
    osd_id: int
    host: str
    status: str  # up/down
    in_cluster: bool
    weight: float
    device_class: str


@dataclass
class PoolStatus:
    """Pool status information"""
    name: str
    pool_id: int
    size: int
    min_size: int
    pg_num: int
    pgp_num: int
    used_bytes: int
    max_avail_bytes: int
    percent_used: float


@dataclass
class MonitorStatus:
    """Monitor status"""
    name: str
    rank: int
    address: str
    in_quorum: bool


@dataclass
class ManagerStatus:
    """Manager status"""
    name: str
    active: bool
    address: str


@dataclass
class CEPHHealth:
    """Overall CEPH health"""
    status: str  # HEALTH_OK, HEALTH_WARN, HEALTH_ERR
    num_osds: int
    num_up_osds: int
    num_in_osds: int
    num_pgs: int
    num_active_clean_pgs: int
    monitors: List[MonitorStatus] = field(default_factory=list)
    managers: List[ManagerStatus] = field(default_factory=list)
    osds: List[OSDStatus] = field(default_factory=list)
    pools: List[PoolStatus] = field(default_factory=list)
    data_bytes: int = 0
    used_bytes: int = 0
    avail_bytes: int = 0
    total_bytes: int = 0
    warnings: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    @property
    def is_healthy(self) -> bool:
        """Check if CEPH is in a healthy state"""
        return (
            self.status == 'HEALTH_OK' and
            self.num_up_osds == self.num_osds and
            self.num_in_osds == self.num_osds and
            self.num_active_clean_pgs == self.num_pgs and
            len(self.errors) == 0
        )

    @property
    def percent_used(self) -> float:
        """Cluster usage as a percentage of raw capacity"""
        # Divide by total raw capacity, not logical data stored
        if self.total_bytes == 0:
            return 0.0
        return (self.used_bytes / self.total_bytes) * 100


class CEPHHealthChecker:
    """Check CEPH cluster health via SSH"""

    def __init__(self, node: str):
        # Validate node is a valid hostname or IP address
        if not self._validate_node(node):
            raise ValueError(f"Invalid node name or IP address: {node}")
        self.node = node
        self.health = CEPHHealth(
            status="UNKNOWN",
            num_osds=0,
            num_up_osds=0,
            num_in_osds=0,
            num_pgs=0,
            num_active_clean_pgs=0
        )

    def _validate_node(self, node: str) -> bool:
        """Validate node is a valid hostname or IP address"""
        # Allow valid hostnames and IPv4/IPv6 addresses
        hostname_pattern = r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
        ipv4_pattern = r'^(\d{1,3}\.){3}\d{1,3}$'
        ipv6_pattern = r'^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$'
        return bool(
            re.match(hostname_pattern, node) or
            re.match(ipv4_pattern, node) or
            re.match(ipv6_pattern, node)
        )

    def run_command(self, command: str) -> str:
        """Execute command on remote node via SSH"""
        try:
            # Use -- to prevent SSH option injection
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", f"root@{self.node}", "--", command],
                capture_output=True,
                text=True,
                check=True,
                timeout=30
            )
            return result.stdout
        except subprocess.TimeoutExpired as e:
            error_msg = f"Command timed out after 30s: {command}"
            self.health.errors.append(error_msg)
            raise RuntimeError(error_msg) from e
        except subprocess.CalledProcessError as e:
            error_msg = f"Command failed: {command}: {e.stderr}"
            self.health.errors.append(error_msg)
            raise RuntimeError(error_msg) from e

    def check_ceph_status(self):
        """Check ceph status output"""
        output = self.run_command("ceph status --format json")
        if not output:
            self.health.errors.append("Failed to get CEPH status")
            return

        try:
            status_data = json.loads(output)

            # Parse overall health
            self.health.status = status_data.get('health', {}).get('status', 'UNKNOWN')

            # Parse OSD map (older releases nest it under osdmap.osdmap)
            osd_map = status_data.get('osdmap', {})
            if 'osdmap' in osd_map:
                osd_map = osd_map['osdmap']
            self.health.num_osds = osd_map.get('num_osds', 0)
            self.health.num_up_osds = osd_map.get('num_up_osds', 0)
            self.health.num_in_osds = osd_map.get('num_in_osds', 0)

            # Parse PG map
            pg_map = status_data.get('pgmap', {})
            self.health.num_pgs = pg_map.get('num_pgs', 0)

            # Parse PG states
            pg_states = pg_map.get('pgs_by_state', [])
            for state in pg_states:
                if state.get('state_name') == 'active+clean':
                    self.health.num_active_clean_pgs = state.get('count', 0)

            # Parse storage usage
            self.health.data_bytes = pg_map.get('data_bytes', 0)
            self.health.used_bytes = pg_map.get('bytes_used', 0)
            self.health.avail_bytes = pg_map.get('bytes_avail', 0)
            self.health.total_bytes = pg_map.get('bytes_total', 0)

            # Check for health warnings
            health_checks = status_data.get('health', {}).get('checks', {})
            for check_name, check_data in health_checks.items():
                severity = check_data.get('severity', '')
                summary = check_data.get('summary', {}).get('message', '')

                if severity == 'HEALTH_ERR':
                    self.health.errors.append(f"{check_name}: {summary}")
                elif severity == 'HEALTH_WARN':
                    self.health.warnings.append(f"{check_name}: {summary}")

        except (json.JSONDecodeError, KeyError) as e:
            self.health.errors.append(f"Failed to parse CEPH status: {e}")

    def check_monitors(self):
        """Check monitor status"""
        output = self.run_command("ceph mon dump --format json")
        if not output:
            self.health.warnings.append("Failed to get monitor status")
            return

        try:
            mon_data = json.loads(output)
            quorum = set()

            # Get quorum members
            quorum_output = self.run_command("ceph quorum_status --format json")
            if quorum_output:
                quorum_data = json.loads(quorum_output)
                quorum = set(quorum_data.get('quorum', []))

            # Parse monitors
            for mon in mon_data.get('mons', []):
                self.health.monitors.append(MonitorStatus(
                    name=mon.get('name', ''),
                    rank=mon.get('rank', -1),
                    address=mon.get('addr', ''),
                    in_quorum=mon.get('rank', -1) in quorum
                ))

            # Check if all monitors are in quorum
            not_in_quorum = [m.name for m in self.health.monitors if not m.in_quorum]
            if not_in_quorum:
                self.health.warnings.append(
                    f"Monitors not in quorum: {', '.join(not_in_quorum)}"
                )

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse monitor status: {e}")

    def check_managers(self):
        """Check manager status"""
        output = self.run_command("ceph mgr dump --format json")
        if not output:
            self.health.warnings.append("Failed to get manager status")
            return

        try:
            mgr_data = json.loads(output)

            # Active manager
            active_name = mgr_data.get('active_name', '')
            active_addr = mgr_data.get('active_addr', '')
            if active_name:
                self.health.managers.append(ManagerStatus(
                    name=active_name,
                    active=True,
                    address=active_addr
                ))

            # Standby managers (mgr dump only exposes a gid for standbys)
            for standby in mgr_data.get('standbys', []):
                self.health.managers.append(ManagerStatus(
                    name=standby.get('name', ''),
                    active=False,
                    address=str(standby.get('gid', ''))
                ))

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse manager status: {e}")

    def check_osds(self):
        """Check OSD status"""
        output = self.run_command("ceph osd tree --format json")
        if not output:
            self.health.warnings.append("Failed to get OSD tree")
            return

        try:
            osd_data = json.loads(output)

            # Map OSD ids to their host: host entries in the tree list
            # their OSDs under 'children'
            host_of = {}
            for node in osd_data.get('nodes', []):
                if node.get('type') == 'host':
                    for child in node.get('children', []):
                        host_of[child] = node.get('name', 'unknown')

            # Parse OSD nodes
            for node in osd_data.get('nodes', []):
                if node.get('type') == 'osd':
                    osd_id = node.get('id', -1)
                    status = node.get('status', 'unknown')
                    in_cluster = node.get('exists', 0) == 1

                    self.health.osds.append(OSDStatus(
                        osd_id=osd_id,
                        host=host_of.get(osd_id, 'unknown'),
                        status=status,
                        in_cluster=in_cluster,
                        weight=node.get('crush_weight', 0.0),
                        device_class=node.get('device_class', 'unknown')
                    ))

            # Check for down OSDs
            down_osds = [o.osd_id for o in self.health.osds if o.status != 'up']
            if down_osds:
                self.health.errors.append(f"OSDs down: {down_osds}")

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse OSD tree: {e}")

    def check_pools(self):
        """Check pool status"""
        output = self.run_command("ceph osd pool ls detail --format json")
        if not output:
            self.health.warnings.append("Failed to get pool information")
            return

        try:
            pool_data = json.loads(output)

            # Usage figures come from 'ceph df'; 'ceph osd pool stats'
            # only reports IO and recovery rates
            df_stats = {}
            df_output = self.run_command("ceph df --format json")
            if df_output:
                df_data = json.loads(df_output)
                df_stats = {p.get('name', ''): p.get('stats', {})
                            for p in df_data.get('pools', [])}

            for pool in pool_data:
                pool_name = pool.get('pool_name', '')
                stats = df_stats.get(pool_name, {})

                self.health.pools.append(PoolStatus(
                    name=pool_name,
                    pool_id=pool.get('pool', 0),
                    size=pool.get('size', 0),
                    min_size=pool.get('min_size', 0),
                    pg_num=pool.get('pg_num', 0),
                    pgp_num=pool.get('pgp_num', 0),
                    used_bytes=stats.get('bytes_used', 0),
                    max_avail_bytes=stats.get('max_avail', 0),
                    percent_used=stats.get('percent_used', 0.0) * 100
                ))

        except (json.JSONDecodeError, KeyError) as e:
            self.health.warnings.append(f"Failed to parse pool information: {e}")

    def check_pg_state(self):
        """Verify all PGs are active+clean"""
        if self.health.num_active_clean_pgs != self.health.num_pgs:
            self.health.errors.append(
                f"Not all PGs active+clean: {self.health.num_active_clean_pgs}/{self.health.num_pgs}"
            )

    def run_all_checks(self) -> CEPHHealth:
        """Run all health checks"""
        self.check_ceph_status()
        self.check_monitors()
        self.check_managers()
        self.check_osds()
        self.check_pools()
        self.check_pg_state()

        return self.health


def human_readable_size(bytes_val: int) -> str:
    """Convert bytes to a human-readable string"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if bytes_val < 1024.0:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024.0
    return f"{bytes_val:.2f} EB"


def main():
    parser = argparse.ArgumentParser(
        description="Check CEPH cluster health",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--node',
        default='foxtrot',
        help='Cluster node to check (default: foxtrot)'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output as JSON'
    )
    parser.add_argument(
        '--min-osds',
        type=int,
        help='Minimum expected OSD count (error if below this)'
    )

    args = parser.parse_args()

    # Run health checks
    checker = CEPHHealthChecker(args.node)
    health = checker.run_all_checks()

    # Check minimum OSD count
    if args.min_osds and health.num_osds < args.min_osds:
        health.errors.append(
            f"OSD count below minimum: {health.num_osds} < {args.min_osds}"
        )

    if args.json:
        # Output as JSON
        print(json.dumps(asdict(health), indent=2))
        # Exit with appropriate code based on health status
        sys.exit(0 if health.is_healthy else 1)
    else:
        # Human-readable output
        print("CEPH Cluster Health Check")
        print("=" * 60)
        print(f"Overall Status: {health.status}")
        print(f"OSDs: {health.num_up_osds}/{health.num_osds} up, {health.num_in_osds}/{health.num_osds} in")
        print(f"PGs: {health.num_active_clean_pgs}/{health.num_pgs} active+clean")
        print(f"Usage: {health.percent_used:.1f}% ({human_readable_size(health.used_bytes)}/{human_readable_size(health.total_bytes)})")

        print("\nMonitors:")
        for mon in health.monitors:
            quorum_status = "✓" if mon.in_quorum else "✗"
            print(f"  {quorum_status} {mon.name} (rank: {mon.rank}, {mon.address})")

        print("\nManagers:")
        for mgr in health.managers:
            active_status = "ACTIVE" if mgr.active else "STANDBY"
            print(f"  {mgr.name} ({active_status}, {mgr.address})")

        print("\nOSDs:")
        for osd in health.osds:
            status = "✓" if osd.status == 'up' else "✗"
            in_status = "in" if osd.in_cluster else "out"
            print(f"  {status} osd.{osd.osd_id} on {osd.host} ({in_status}, {osd.device_class})")

        print("\nPools:")
        for pool in health.pools:
            print(f"  {pool.name}: size={pool.size}, min_size={pool.min_size}, "
                  f"pgs={pool.pg_num}, used={pool.percent_used:.1f}%")

        if health.warnings:
            print("\nWarnings:")
            for warning in health.warnings:
                print(f"  ⚠ {warning}")

        if health.errors:
            print("\nErrors:")
            for error in health.errors:
                print(f"  ✗ {error}")

        print("\n" + "=" * 60)
        if health.is_healthy:
            print("Status: ✓ HEALTHY")
            sys.exit(0)
        else:
            print("Status: ✗ UNHEALTHY")
            sys.exit(1)


if __name__ == '__main__':
    main()
339
skills/proxmox-infrastructure/tools/check_cluster_health.py
Executable file
@@ -0,0 +1,339 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Proxmox Cluster Health Checker

Validates Proxmox cluster health including:
- Cluster quorum status
- Node membership and status
- Corosync ring health
- Resource manager status
- Configuration version sync

Usage:
    python check_cluster_health.py [--node NODE] [--json]

Examples:
    # Check cluster health (requires SSH access to cluster node)
    python check_cluster_health.py --node foxtrot

    # Output as JSON for parsing
    python check_cluster_health.py --node foxtrot --json
"""

import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class NodeStatus:
    """Cluster node status"""
    name: str
    online: bool
    node_id: int
    ip: str


@dataclass
class CorosyncStatus:
    """Corosync ring status"""
    ring_id: int
    nodes: List[str]
    status: str


@dataclass
class ClusterHealth:
    """Overall cluster health"""
    cluster_name: str
    quorate: bool
    node_count: int
    expected_votes: int
    total_votes: int
    nodes: List[NodeStatus]
    corosync_rings: List[CorosyncStatus]
    config_version: Optional[int]
    warnings: List[str]
    errors: List[str]

    @property
    def is_healthy(self) -> bool:
        """Check if cluster is in a healthy state"""
        return self.quorate and len(self.errors) == 0


class ClusterHealthChecker:
    """Check Proxmox cluster health via SSH"""

    def __init__(self, node: str):
        # Validate node is a valid hostname or IP address
        if not self._validate_node(node):
            raise ValueError(f"Invalid node name or IP address: {node}")
        self.node = node
        self.health = ClusterHealth(
            cluster_name="",
            quorate=False,
            node_count=0,
            expected_votes=0,
            total_votes=0,
            nodes=[],
            corosync_rings=[],
            config_version=None,
            warnings=[],
            errors=[]
        )

    def _validate_node(self, node: str) -> bool:
        """Validate node is a valid hostname or IP address"""
        # Allow valid hostnames and IPv4/IPv6 addresses
        hostname_pattern = r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
        ipv4_pattern = r'^(\d{1,3}\.){3}\d{1,3}$'
        ipv6_pattern = r'^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$'
        return bool(
            re.match(hostname_pattern, node) or
            re.match(ipv4_pattern, node) or
            re.match(ipv6_pattern, node)
        )

    def run_command(self, command: str) -> str:
        """Execute command on remote node via SSH"""
        try:
            # Use -- to prevent SSH option injection
            result = subprocess.run(
                ["ssh", "-o", "BatchMode=yes", f"root@{self.node}", "--", command],
                capture_output=True,
                text=True,
                check=True,
                timeout=30
            )
            return result.stdout
        except subprocess.TimeoutExpired:
            self.health.errors.append(f"Command timed out: {command}")
            return ""
        except subprocess.CalledProcessError as e:
            self.health.errors.append(f"Command failed: {command}: {e.stderr}")
            return ""

    def check_cluster_status(self):
        """Check pvecm status output"""
        output = self.run_command("pvecm status")
        if not output:
            self.health.errors.append("Failed to get cluster status")
            return

        # Parse cluster name
        cluster_match = re.search(r'Cluster name:\s+(\S+)', output)
        if cluster_match:
            self.health.cluster_name = cluster_match.group(1)

        # Parse quorum status
        quorum_match = re.search(r'Quorate:\s+(\w+)', output)
        if quorum_match:
            self.health.quorate = quorum_match.group(1).lower() == 'yes'

        if not self.health.quorate:
            self.health.errors.append("Cluster does not have quorum!")

        # Parse node count
        node_match = re.search(r'Nodes:\s+(\d+)', output)
        if node_match:
            self.health.node_count = int(node_match.group(1))

        # Parse expected votes
        expected_match = re.search(r'Expected votes:\s+(\d+)', output)
        if expected_match:
            self.health.expected_votes = int(expected_match.group(1))

        # Parse total votes
        total_match = re.search(r'Total votes:\s+(\d+)', output)
        if total_match:
            self.health.total_votes = int(total_match.group(1))

        # Check if we have majority
        if self.health.total_votes < (self.health.expected_votes // 2 + 1):
            self.health.errors.append(
                f"Insufficient votes: {self.health.total_votes}/{self.health.expected_votes}"
            )

    def check_nodes(self):
        """Check node membership"""
        output = self.run_command("pvecm nodes")
        if not output:
            self.health.warnings.append("Failed to get node list")
            return

        # Parse node list (skip header)
        lines = output.strip().split('\n')[1:]  # Skip header
        for line in lines:
            if not line.strip():
                continue

            # Example: " 1 0x00000001 foxtrot 192.168.3.5"
            parts = line.split()
            if len(parts) >= 3:
                try:
                    node_id = int(parts[0])
                    name = parts[2] if len(parts) >= 3 else "unknown"
                    ip = parts[3] if len(parts) >= 4 else "unknown"
                    online = True  # If in list, assumed online

                    self.health.nodes.append(NodeStatus(
                        name=name,
                        online=online,
                        node_id=node_id,
                        ip=ip
                    ))
                except (ValueError, IndexError) as e:
                    self.health.warnings.append(f"Failed to parse node line: {line}: {e}")

        # Verify expected node count
        if len(self.health.nodes) != self.health.node_count:
            self.health.warnings.append(
                f"Node count mismatch: expected {self.health.node_count}, found {len(self.health.nodes)}"
            )

    def check_corosync(self):
        """Check corosync ring status"""
        output = self.run_command("corosync-cfgtool -s")
        if not output:
            self.health.warnings.append("Failed to get corosync status")
            return

        # Parse corosync status
        # Example output:
        #   Printing ring status.
        #   Local node ID 1
        #   RING ID 0
        #       id      = 192.168.8.5
        #       status  = ring 0 active with no faults

        current_ring = None
        for line in output.split('\n'):
            line = line.strip()

            if line.startswith('RING ID'):
                ring_match = re.search(r'RING ID (\d+)', line)
                if ring_match:
                    current_ring = int(ring_match.group(1))

            elif 'status' in line.lower() and current_ring is not None:
                status_match = re.search(r'status\s*=\s*(.+)', line)
                if status_match:
                    status = status_match.group(1)

                    # Check for faults
                    if 'no faults' not in status.lower():
                        self.health.errors.append(f"Corosync ring {current_ring}: {status}")

                    self.health.corosync_rings.append(CorosyncStatus(
                        ring_id=current_ring,
                        nodes=[],  # Could parse this if needed
                        status=status
                    ))

    def check_config_version(self):
        """Check cluster configuration version"""
        output = self.run_command("corosync-cmapctl -b totem.config_version")
        if output:
            try:
                self.health.config_version = int(output.strip())
            except ValueError:
                self.health.warnings.append("Failed to parse config version")

    def check_resource_manager(self):
        """Check pve-cluster service status"""
        output = self.run_command("systemctl is-active pve-cluster")
        if output.strip() != "active":
            self.health.errors.append("pve-cluster service is not active")

        # Check pmxcfs filesystem
        output = self.run_command("pvecm status | grep -i 'cluster filesystem'")
        if output and 'online' not in output.lower():
            self.health.warnings.append("Cluster filesystem may not be online")

    def run_all_checks(self) -> ClusterHealth:
        """Run all health checks"""
        self.check_cluster_status()
        self.check_nodes()
        self.check_corosync()
        self.check_config_version()
        self.check_resource_manager()

        return self.health


def main():
    parser = argparse.ArgumentParser(
        description="Check Proxmox cluster health",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--node',
        default='foxtrot',
        help='Cluster node to check (default: foxtrot)'
    )
    parser.add_argument(
        '--json',
        action='store_true',
        help='Output as JSON'
    )

    args = parser.parse_args()

    # Run health checks
    checker = ClusterHealthChecker(args.node)
    health = checker.run_all_checks()

    if args.json:
        # Output as JSON
        print(json.dumps(asdict(health), indent=2))
        # Mirror the human-readable path's exit code so automation can branch on it
        sys.exit(0 if health.is_healthy else 1)
    else:
        # Human-readable output
        print(f"Cluster Health Check: {health.cluster_name}")
        print("=" * 60)
        print(f"Quorum Status: {'✓ YES' if health.quorate else '✗ NO'}")
        print(f"Nodes: {health.node_count} ({health.total_votes}/{health.expected_votes} votes)")

        if health.config_version:
            print(f"Config Version: {health.config_version}")

        print("\nNodes:")
        for node in health.nodes:
            status = "✓" if node.online else "✗"
            print(f"  {status} {node.name} (ID: {node.node_id}, IP: {node.ip})")

        print("\nCorosync Rings:")
        for ring in health.corosync_rings:
            print(f"  Ring {ring.ring_id}: {ring.status}")

        if health.warnings:
            print("\nWarnings:")
            for warning in health.warnings:
                print(f"  ⚠ {warning}")

        if health.errors:
            print("\nErrors:")
            for error in health.errors:
                print(f"  ✗ {error}")

        print("\n" + "=" * 60)
        if health.is_healthy:
            print("Status: ✓ HEALTHY")
            sys.exit(0)
        else:
            print("Status: ✗ UNHEALTHY")
            sys.exit(1)


if __name__ == '__main__':
    main()
252
skills/proxmox-infrastructure/tools/cluster_status.py
Executable file
@@ -0,0 +1,252 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
"""
Display Proxmox cluster health and resource usage.

Usage:
    ./cluster_status.py
    ./cluster_status.py --node foxtrot
    ./cluster_status.py --detailed

Environment Variables:
    PROXMOX_VE_ENDPOINT - Proxmox API endpoint (e.g., https://192.168.3.5:8006)
    PROXMOX_VE_USERNAME - Username (e.g., root@pam)
    PROXMOX_VE_PASSWORD - Password
    OR
    PROXMOX_VE_API_TOKEN - API token (user@realm!token-id=secret)
"""

import argparse
import os
import sys

from proxmoxer import ProxmoxAPI, ResourceException


class ClusterMonitor:
    """Monitor Proxmox cluster health and resources."""

    def __init__(self, endpoint: str, auth_type: str, **auth_kwargs):
        """Initialize Proxmox connection."""
        self.endpoint = endpoint.replace("https://", "").replace(":8006", "")

        try:
            if auth_type == "token":
                user, token = auth_kwargs["token"].split("!")
                token_name, token_value = token.split("=")
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=user,
                    token_name=token_name,
                    token_value=token_value,
                    verify_ssl=False
                )
            else:
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=auth_kwargs["user"],
                    password=auth_kwargs["password"],
                    verify_ssl=False
                )
        except Exception as e:
            print(f"❌ Failed to connect to Proxmox: {e}", file=sys.stderr)
            sys.exit(1)

    def get_cluster_status(self):
        """Get cluster status and quorum info."""
        try:
            status = self.proxmox.cluster.status.get()
            return status
        except ResourceException as e:
            print(f"❌ Failed to get cluster status: {e}", file=sys.stderr)
            return None

    def get_node_status(self, node_name: str):
        """Get detailed node status."""
        try:
            status = self.proxmox.nodes(node_name).status.get()
            return status
        except ResourceException as e:
            print(f"❌ Failed to get node status: {e}", file=sys.stderr)
            return None

    def get_node_vms(self, node_name: str):
        """Get VMs on a node."""
        try:
            vms = self.proxmox.nodes(node_name).qemu.get()
            return vms
        except ResourceException as e:
            print(f"❌ Failed to get VMs: {e}", file=sys.stderr)
            return []

    def display_cluster_overview(self):
        """Display cluster overview."""
        print("🖥️  Proxmox Cluster Status")
        print("=" * 70)

        cluster_status = self.get_cluster_status()
        if not cluster_status:
            return

        # Find cluster info
        cluster_info = next((item for item in cluster_status if item['type'] == 'cluster'), None)
        if cluster_info:
            print(f"\n📊 Cluster: {cluster_info.get('name', 'N/A')}")
            print(f"   Quorum: {cluster_info.get('quorate', 0)} (nodes: {cluster_info.get('nodes', 0)})")

        # Node statuses
        nodes = [item for item in cluster_status if item['type'] == 'node']

        print(f"\n🔧 Nodes ({len(nodes)}):")
        print(f"{'Node':<15} {'Status':<10} {'CPU':<12} {'Memory':<20} {'VMs':<8}")
        print("-" * 70)

        for node_info in nodes:
            node_name = node_info['name']
            online = "✓ Online" if node_info.get('online', 0) == 1 else "✗ Offline"

            # Get detailed status
            detailed = self.get_node_status(node_name)
            if not detailed:
                print(f"{node_name:<15} {online:<10} {'N/A':<12} {'N/A':<20} {'N/A':<8}")
                continue

            # CPU usage
            cpu_pct = detailed.get('cpu', 0) * 100
            cpu_str = f"{cpu_pct:.1f}%"

            # Memory usage
            mem_used = detailed.get('memory', {}).get('used', 0) / (1024**3)  # GB
            mem_total = detailed.get('memory', {}).get('total', 0) / (1024**3)  # GB
            mem_pct = (mem_used / mem_total * 100) if mem_total > 0 else 0
            mem_str = f"{mem_used:.1f}/{mem_total:.1f}GB ({mem_pct:.1f}%)"

            # VM count
            vms = self.get_node_vms(node_name)
            vm_count = len(vms)
            running_vms = len([vm for vm in vms if vm.get('status') == 'running'])
            vm_str = f"{running_vms}/{vm_count}"

            print(f"{node_name:<15} {online:<10} {cpu_str:<12} {mem_str:<20} {vm_str:<8}")

        print("=" * 70)

    def display_node_detail(self, node_name: str):
        """Display detailed node information."""
        print(f"\n🔍 Node Details: {node_name}")
        print("=" * 70)

        status = self.get_node_status(node_name)
        if not status:
            return

        # System info
        print("\n💻 System:")
        print(f"   Uptime: {status.get('uptime', 0) / 86400:.1f} days")
        # The API returns loadavg entries as strings, so coerce before formatting
        loadavg = status.get('loadavg', [0, 0, 0])
        print(f"   Load Average: {float(loadavg[0]):.2f}")
        print(f"   CPU Cores: {status.get('cpuinfo', {}).get('cpus', 'N/A')}")

        # CPU
        cpu_pct = status.get('cpu', 0) * 100
        print(f"\n🖥️  CPU Usage: {cpu_pct:.1f}%")

        # Memory
        mem = status.get('memory', {})
        mem_used = mem.get('used', 0) / (1024**3)
        mem_total = mem.get('total', 0) / (1024**3)
        mem_free = mem.get('free', 0) / (1024**3)
        mem_pct = (mem_used / mem_total * 100) if mem_total > 0 else 0

        print("\n💾 Memory:")
        print(f"   Used: {mem_used:.2f} GB ({mem_pct:.1f}%)")
        print(f"   Free: {mem_free:.2f} GB")
        print(f"   Total: {mem_total:.2f} GB")

        # Storage
        root = status.get('rootfs', {})
        root_used = root.get('used', 0) / (1024**3)
        root_total = root.get('total', 0) / (1024**3)
        root_avail = root.get('avail', 0) / (1024**3)
        root_pct = (root_used / root_total * 100) if root_total > 0 else 0

        print("\n💿 Root Filesystem:")
        print(f"   Used: {root_used:.2f} GB ({root_pct:.1f}%)")
        print(f"   Available: {root_avail:.2f} GB")
        print(f"   Total: {root_total:.2f} GB")

        # VMs
        vms = self.get_node_vms(node_name)
        print(f"\n🖼️  Virtual Machines ({len(vms)}):")

        if vms:
            print(f"   {'VMID':<8} {'Name':<25} {'Status':<10} {'CPU':<8} {'Memory':<15}")
            print("   " + "-" * 66)

            for vm in vms:
                vmid = vm.get('vmid', 'N/A')
                name = vm.get('name', 'N/A')[:24]
                status = vm.get('status', 'unknown')
                cpu_pct = vm.get('cpu', 0) * 100 if vm.get('status') == 'running' else 0
                mem = vm.get('mem', 0) / (1024**2) if vm.get('status') == 'running' else 0  # MB

                status_icon = "▶️" if status == "running" else "⏸️"
                print(f"   {vmid:<8} {name:<25} {status_icon} {status:<8} {cpu_pct:>6.1f}% {mem:>8.0f} MB")
        else:
            print("   No VMs found")

        print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Display Proxmox cluster health and resource usage"
    )
    parser.add_argument(
        "--node",
        help="Show detailed info for specific node"
    )
    parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed info for all nodes"
    )

    args = parser.parse_args()

    # Get authentication from environment
    endpoint = os.getenv("PROXMOX_VE_ENDPOINT")
    api_token = os.getenv("PROXMOX_VE_API_TOKEN")
    username = os.getenv("PROXMOX_VE_USERNAME")
    password = os.getenv("PROXMOX_VE_PASSWORD")

    if not endpoint:
        print("❌ PROXMOX_VE_ENDPOINT environment variable required", file=sys.stderr)
        sys.exit(1)

    # Determine authentication method
    if api_token:
        monitor = ClusterMonitor(endpoint, "token", token=api_token)
    elif username and password:
        monitor = ClusterMonitor(endpoint, "password", user=username, password=password)
    else:
        print("❌ Authentication required: set PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD", file=sys.stderr)
        sys.exit(1)

    # Display status
    if args.node:
        monitor.display_node_detail(args.node)
    elif args.detailed:
        monitor.display_cluster_overview()
        # Get all nodes and show details
        cluster_status = monitor.get_cluster_status()
        if cluster_status:
            nodes = [item['name'] for item in cluster_status if item['type'] == 'node']
            for node_name in nodes:
                monitor.display_node_detail(node_name)
    else:
        monitor.display_cluster_overview()


if __name__ == "__main__":
    main()
224
skills/proxmox-infrastructure/tools/validate_template.py
Executable file
@@ -0,0 +1,224 @@
#!/usr/bin/env -S uv run --script --quiet
# /// script
# dependencies = ["proxmoxer", "requests"]
# ///
"""
Validate Proxmox VM template health and configuration.

Usage:
    ./validate_template.py --template-id 9000 --node foxtrot
    ./validate_template.py --template-id 9000 --all-nodes

Environment Variables:
    PROXMOX_VE_ENDPOINT - Proxmox API endpoint (e.g., https://192.168.3.5:8006)
    PROXMOX_VE_USERNAME - Username (e.g., root@pam)
    PROXMOX_VE_PASSWORD - Password
    OR
    PROXMOX_VE_API_TOKEN - API token (user@realm!token-id=secret)
"""

import argparse
import os
import sys
from typing import Optional

from proxmoxer import ProxmoxAPI, ResourceException


class TemplateValidator:
    """Validates Proxmox VM templates."""

    def __init__(self, endpoint: str, auth_type: str, **auth_kwargs):
        """Initialize Proxmox connection."""
        self.endpoint = endpoint.replace("https://", "").replace(":8006", "")

        try:
            if auth_type == "token":
                user, token = auth_kwargs["token"].split("!")
                token_name, token_value = token.split("=")
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=user,
                    token_name=token_name,
                    token_value=token_value,
                    verify_ssl=False
                )
            else:
                self.proxmox = ProxmoxAPI(
                    self.endpoint,
                    user=auth_kwargs["user"],
                    password=auth_kwargs["password"],
                    verify_ssl=False
                )
        except Exception as e:
            print(f"❌ Failed to connect to Proxmox: {e}", file=sys.stderr)
            sys.exit(1)

    def find_template(self, template_id: int, node: Optional[str] = None):
        """Find template on cluster."""
        nodes = [node] if node else [n['node'] for n in self.proxmox.nodes.get()]

        for node_name in nodes:
            try:
                vms = self.proxmox.nodes(node_name).qemu.get()
                for vm in vms:
                    if vm['vmid'] == template_id:
                        return node_name, vm
            except ResourceException:
                continue

        return None, None

    def validate_template(self, template_id: int, node: Optional[str] = None):
        """Validate template configuration."""
        print(f"🔍 Validating template {template_id}...")

        # Find template
        node_name, vm_info = self.find_template(template_id, node)

        if not node_name:
            print(f"❌ Template {template_id} not found", file=sys.stderr)
            return False

        print(f"✓ Found on node: {node_name}")

        # Check if it's actually a template
        if vm_info.get('template', 0) != 1:
            print(f"❌ VM {template_id} is not a template", file=sys.stderr)
            return False

        print("✓ Confirmed as template")

        # Get detailed config
        try:
            config = self.proxmox.nodes(node_name).qemu(template_id).config.get()
        except ResourceException as e:
            print(f"❌ Failed to get template config: {e}", file=sys.stderr)
            return False

        # Validation checks
        checks = {
            "Cloud-init drive": self._check_cloudinit(config),
            "QEMU guest agent": self._check_agent(config),
            "SCSI controller": self._check_scsi(config),
            "Boot disk": self._check_boot_disk(config),
            "Serial console": self._check_serial(config),
            "EFI disk": self._check_efi(config),
        }

        # Print results
        print("\n📋 Validation Results:")
        print("-" * 50)

        all_passed = True
        for check_name, (passed, message) in checks.items():
            status = "✓" if passed else "✗"
            print(f"{status} {check_name}: {message}")
            if not passed:
                all_passed = False

        print("-" * 50)

        # Print template info
        print("\n📊 Template Info:")
        print(f"   Name: {config.get('name', 'N/A')}")
        print(f"   Memory: {config.get('memory', 'N/A')} MB")
        print(f"   Cores: {config.get('cores', 'N/A')}")
        print(f"   Sockets: {config.get('sockets', 'N/A')}")

        if all_passed:
            print(f"\n✅ Template {template_id} is properly configured")
        else:
            print(f"\n⚠️  Template {template_id} has configuration issues")

        return all_passed

    def _check_cloudinit(self, config):
        """Check for cloud-init drive."""
        for key in config:
            if key.startswith('ide') and 'cloudinit' in str(config[key]):
                return True, f"Found at {key}"
        return False, "Missing cloud-init drive (should be ide2)"

    def _check_agent(self, config):
        """Check for QEMU guest agent."""
        agent = config.get('agent', '0')
        if agent in ['1', 'enabled=1']:
            return True, "Enabled"
        return False, "Not enabled (recommended for IP detection)"

    def _check_scsi(self, config):
        """Check SCSI controller type."""
        scsihw = config.get('scsihw', '')
        if 'virtio' in scsihw:
            return True, f"Using {scsihw}"
        return False, f"Not using virtio-scsi (found: {scsihw or 'none'})"

    def _check_boot_disk(self, config):
        """Check for boot disk."""
        for key in config:
            if key.startswith('scsi') and key != 'scsihw':
                return True, f"Found at {key}"
        return False, "No SCSI disk found"

    def _check_serial(self, config):
        """Check for serial console."""
        if 'serial0' in config:
            return True, "Configured"
        return False, "Not configured (recommended for cloud images)"

    def _check_efi(self, config):
        """Check for EFI disk."""
        if 'efidisk0' in config:
            return True, "Configured"
        return False, "Not configured (needed for UEFI boot)"


def main():
    parser = argparse.ArgumentParser(
        description="Validate Proxmox VM template health and configuration"
    )
    parser.add_argument(
        "--template-id",
        type=int,
        required=True,
        help="Template VM ID (e.g., 9000)"
    )
    parser.add_argument(
        "--node",
        help="Specific Proxmox node to check (default: search all nodes)"
    )
    parser.add_argument(
        "--all-nodes",
        action="store_true",
        help="Search all nodes in cluster"
    )

    args = parser.parse_args()

    # Get authentication from environment
    endpoint = os.getenv("PROXMOX_VE_ENDPOINT")
    api_token = os.getenv("PROXMOX_VE_API_TOKEN")
    username = os.getenv("PROXMOX_VE_USERNAME")
    password = os.getenv("PROXMOX_VE_PASSWORD")

    if not endpoint:
        print("❌ PROXMOX_VE_ENDPOINT environment variable required", file=sys.stderr)
        sys.exit(1)

    # Determine authentication method
    if api_token:
        validator = TemplateValidator(endpoint, "token", token=api_token)
    elif username and password:
        validator = TemplateValidator(endpoint, "password", user=username, password=password)
    else:
        print("❌ Authentication required: set PROXMOX_VE_API_TOKEN or PROXMOX_VE_USERNAME/PASSWORD", file=sys.stderr)
        sys.exit(1)

    # Validate template
    node = None if args.all_nodes else args.node
    success = validator.validate_template(args.template_id, node)

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
782
skills/proxmox-infrastructure/workflows/ceph-deployment.md
Normal file
@@ -0,0 +1,782 @@
# CEPH Storage Deployment Workflow

Complete guide to deploying CEPH storage on a Proxmox VE cluster with automated OSD creation, pool
configuration, and health verification.

## Overview

This workflow automates CEPH deployment with:

- CEPH package installation
- Cluster initialization with proper network configuration
- Monitor and manager creation across all nodes
- Automated OSD creation with partition support
- Pool configuration with replication and compression
- Comprehensive health verification

## Prerequisites

Before deploying CEPH:

1. **Cluster must be formed:**
   - Proxmox cluster already initialized and healthy
   - All nodes showing quorum
   - See [Cluster Formation](cluster-formation.md) first

2. **Network requirements:**
   - Dedicated CEPH public network (192.168.5.0/24 for Matrix)
   - Dedicated CEPH private/cluster network (192.168.7.0/24 for Matrix)
   - MTU 9000 (jumbo frames) configured on CEPH networks
   - Bridges configured: vmbr1 (public), vmbr2 (private)

3. **Storage requirements:**
   - Dedicated disks for OSDs (not boot disks)
   - All OSD disks should be the same type (SSD/NVMe)
   - Matrix: 2× 4TB Samsung 990 PRO NVMe per node = 24TB raw

4. **System requirements:**
   - Minimum 3 nodes for production (replication factor 3)
   - At least 4GB RAM per OSD
   - Fast network (10GbE recommended for CEPH networks)

## Phase 1: Install CEPH Packages

### Step 1: Install CEPH

```yaml
# roles/proxmox_ceph/tasks/install.yml
---
- name: Check if CEPH is already installed
  ansible.builtin.stat:
    path: /etc/pve/ceph.conf
  register: ceph_conf_check

- name: Check CEPH packages
  ansible.builtin.command:
    cmd: dpkg -l ceph-common
  register: ceph_package_check
  failed_when: false
  changed_when: false

- name: Install CEPH packages via pveceph
  ansible.builtin.command:
    cmd: "pveceph install --repository {{ ceph_repository }}"
  when: ceph_package_check.rc != 0
  register: ceph_install
  changed_when: "'installed' in ceph_install.stdout | default('')"

- name: Verify CEPH installation
  ansible.builtin.command:
    cmd: ceph --version
  register: ceph_version
  changed_when: false
  failed_when: ceph_version.rc != 0

- name: Display CEPH version
  ansible.builtin.debug:
    msg: "Installed CEPH version: {{ ceph_version.stdout }}"
```

## Phase 2: Initialize CEPH Cluster

### Step 2: Initialize CEPH (First Node Only)

```yaml
# roles/proxmox_ceph/tasks/init.yml
---
- name: Check if CEPH cluster is initialized
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_status_check
  failed_when: false
  changed_when: false

- name: Set CEPH initialization facts
  ansible.builtin.set_fact:
    ceph_initialized: "{{ ceph_status_check.rc == 0 }}"
    is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group | default('matrix_cluster')][0] }}"

- name: Initialize CEPH cluster on first node
  ansible.builtin.command:
    cmd: >
      pveceph init
      --network {{ ceph_network }}
      --cluster-network {{ ceph_cluster_network }}
  when:
    - is_ceph_first_node
    - not ceph_initialized
  register: ceph_init
  changed_when: ceph_init.rc == 0

- name: Wait for CEPH cluster to initialize
  ansible.builtin.pause:
    seconds: 15
  when: ceph_init.changed

- name: Verify CEPH initialization
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_init_verify
  changed_when: false
  when:
    - is_ceph_first_node
  failed_when:
    - ceph_init_verify.rc != 0

- name: Display initial CEPH status
  ansible.builtin.debug:
    var: ceph_init_verify.stdout_lines
  when:
    - is_ceph_first_node
    - ceph_init.changed or ansible_verbosity > 0
```
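
`pveceph init` writes the shared configuration to `/etc/pve/ceph.conf`, which pmxcfs replicates across the cluster, including the `public_network` and `cluster_network` values passed above. If the networks ever need to change, edit that file rather than re-running init.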

## Phase 3: Create Monitors and Managers

### Step 3: Create CEPH Monitors

```yaml
# roles/proxmox_ceph/tasks/monitors.yml
---
- name: Check existing CEPH monitors
  ansible.builtin.command:
    cmd: ceph mon dump --format json
  register: mon_dump
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  failed_when: false
  changed_when: false

- name: Parse monitor list
  ansible.builtin.set_fact:
    existing_monitors: "{{ (mon_dump.stdout | from_json).mons | map(attribute='name') | list }}"
  when: mon_dump.rc == 0

- name: Set monitor facts
  ansible.builtin.set_fact:
    has_monitor: "{{ inventory_hostname_short in existing_monitors | default([]) }}"

- name: Create CEPH monitor on first node
  ansible.builtin.command:
    cmd: pveceph mon create
  when:
    - is_ceph_first_node
    - not has_monitor
  register: mon_create_first
  changed_when: mon_create_first.rc == 0

- name: Wait for first monitor to stabilize
  ansible.builtin.pause:
    seconds: 10
  when: mon_create_first.changed

- name: Create CEPH monitors on other nodes
  ansible.builtin.command:
    cmd: pveceph mon create
  when:
    - not is_ceph_first_node
    - not has_monitor
  register: mon_create_others
  changed_when: mon_create_others.rc == 0

- name: Verify monitor quorum
  ansible.builtin.command:
    cmd: ceph quorum_status --format json
  register: quorum_status
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Check monitor quorum size
  ansible.builtin.assert:
    that:
      - (quorum_status.stdout | from_json).quorum | length >= ((groups[cluster_group | default('matrix_cluster')] | length // 2) + 1)
    fail_msg: "Monitor quorum not established"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
```
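
CEPH monitors form quorum by strict majority, so keep the monitor count odd; with the three Matrix nodes that is one monitor per node, and adding a fourth would raise the quorum requirement without buying any extra failure tolerance.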

### Step 4: Create CEPH Managers

```yaml
# roles/proxmox_ceph/tasks/managers.yml
---
- name: Check existing CEPH managers
  ansible.builtin.command:
    cmd: ceph mgr dump --format json
  register: mgr_dump
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  failed_when: false
  changed_when: false

- name: Parse manager list
  ansible.builtin.set_fact:
    existing_managers: "{{ [(mgr_dump.stdout | from_json).active_name] + ((mgr_dump.stdout | from_json).standbys | map(attribute='name') | list) }}"
  when: mgr_dump.rc == 0

- name: Initialize empty manager list if check failed
  ansible.builtin.set_fact:
    existing_managers: []
  when: mgr_dump.rc != 0

- name: Set manager facts
  ansible.builtin.set_fact:
    has_manager: "{{ inventory_hostname_short in (existing_managers | default([])) }}"

- name: Create CEPH manager
  ansible.builtin.command:
    cmd: pveceph mgr create
  when: not has_manager
  register: mgr_create
  changed_when: mgr_create.rc == 0

- name: Wait for managers to stabilize
  ansible.builtin.pause:
    seconds: 5
  when: mgr_create.changed

- name: Enable CEPH dashboard module
  ansible.builtin.command:
    cmd: ceph mgr module enable dashboard
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  register: dashboard_enable
  changed_when: "'already enabled' not in dashboard_enable.stderr"
  failed_when:
    - dashboard_enable.rc != 0
    - "'already enabled' not in dashboard_enable.stderr"

- name: Enable Prometheus module
  ansible.builtin.command:
    cmd: ceph mgr module enable prometheus
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  register: prometheus_enable
  changed_when: "'already enabled' not in prometheus_enable.stderr"
  failed_when:
    - prometheus_enable.rc != 0
    - "'already enabled' not in prometheus_enable.stderr"
```

## Phase 4: Create OSDs

### Step 5: Prepare and Create OSDs

```yaml
# roles/proxmox_ceph/tasks/osd_create.yml
---
- name: Get list of existing OSDs
  ansible.builtin.command:
    cmd: ceph osd ls
  register: existing_osds
  changed_when: false
  failed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Check OSD devices availability
  ansible.builtin.command:
    cmd: "lsblk -ndo NAME,SIZE,TYPE {{ item.device }}"
  register: device_check
  failed_when: device_check.rc != 0
  changed_when: false
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"

- name: Display device information
  ansible.builtin.debug:
    msg: "Device {{ item.item.device }}: {{ item.stdout }}"
  loop: "{{ device_check.results }}"
  loop_control:
    label: "{{ item.item.device }}"
  when: ansible_verbosity > 0

- name: Wipe existing partitions on OSD devices
  ansible.builtin.command:
    cmd: "wipefs -a {{ item.device }}"
  when:
    - ceph_wipe_disks | default(false)
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"
  register: wipe_result
  changed_when: wipe_result.rc == 0

- name: Create OSDs from whole devices (no partitioning)
  ansible.builtin.command:
    cmd: >
      pveceph osd create {{ item.device }}
      {% if item.db_device is defined and item.db_device %}--db_dev {{ item.db_device }}{% endif %}
      {% if item.wal_device is defined and item.wal_device %}--wal_dev {{ item.wal_device }}{% endif %}
  when:
    - item.partitions | default(1) == 1
  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
  loop_control:
    label: "{{ item.device }}"
  register: osd_create_whole
  changed_when: "'successfully created' in osd_create_whole.stdout | default('')"
  failed_when:
    - osd_create_whole.rc != 0
    - "'already in use' not in osd_create_whole.stderr | default('')"
    - "'ceph-volume' not in osd_create_whole.stderr | default('')"

- name: Create multiple OSDs per device (with partitioning)
  ansible.builtin.command:
    cmd: >
      pveceph osd create {{ item.0.device }}
      --size {{ (item.0.device_size_gb | default(4000) / item.0.partitions) | int }}G
      {% if item.0.db_device is defined and item.0.db_device %}--db_dev {{ item.0.db_device }}{% endif %}
      {% if item.0.wal_device is defined and item.0.wal_device %}--wal_dev {{ item.0.wal_device }}{% endif %}
  when:
    - item.0.partitions | default(1) > 1
  with_subelements:
    - "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
    - partition_indices
    - skip_missing: true
  loop_control:
    label: "{{ item.0.device }} partition {{ item.1 }}"
  register: osd_create_partition
  changed_when: "'successfully created' in osd_create_partition.stdout | default('')"
  failed_when:
    - osd_create_partition.rc != 0
    - "'already in use' not in osd_create_partition.stderr | default('')"

- name: Wait for OSDs to come up
  ansible.builtin.command:
    cmd: ceph osd tree --format json
  register: osd_tree
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  until: >
    (osd_tree.stdout | from_json).nodes
    | selectattr('type', 'equalto', 'osd')
    | selectattr('status', 'equalto', 'up')
    | list | length >= expected_osd_count | int
  retries: 20
  delay: 10
  vars:
    # One OSD per configured partition; one per device when partitions is unset
    expected_osd_count: >-
      {{
        ceph_osds.values()
        | list
        | flatten
        | map(attribute='partitions', default=1)
        | sum
      }}
```

## Phase 5: Create and Configure Pools

### Step 6: Create CEPH Pools

```yaml
# roles/proxmox_ceph/tasks/pools.yml
---
- name: Get existing CEPH pools
  ansible.builtin.command:
    cmd: ceph osd pool ls
  register: existing_pools
  changed_when: false

- name: Create CEPH pools
  ansible.builtin.command:
    cmd: >
      ceph osd pool create {{ item.name }}
      {{ item.pg_num }}
      {{ item.pgp_num | default(item.pg_num) }}
  when: item.name not in existing_pools.stdout_lines
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_create
  changed_when: pool_create.rc == 0

- name: Set pool replication size
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} size {{ item.size }}"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_size
  changed_when: "'set pool' in pool_size.stdout"

- name: Set pool minimum replication size
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} min_size {{ item.min_size }}"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_min_size
  changed_when: "'set pool' in pool_min_size.stdout"

- name: Set pool application
  ansible.builtin.command:
    cmd: "ceph osd pool application enable {{ item.name }} {{ item.application }}"
  when: item.application is defined
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_app
  changed_when: "'enabled application' in pool_app.stdout"
  failed_when:
    - pool_app.rc != 0
    - "'already enabled' not in pool_app.stderr"

- name: Enable compression on pools
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} compression_mode aggressive"
  when: item.compression | default(false)
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_compression
  changed_when: "'set pool' in pool_compression.stdout"

- name: Set compression algorithm
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} compression_algorithm {{ item.compression_algorithm | default('zstd') }}"
  when: item.compression | default(false)
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_compression_algo
  changed_when: "'set pool' in pool_compression_algo.stdout"
```
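
Optionally, PG sizing can be handed to CEPH's built-in autoscaler instead of being fixed at creation time. The task below is a sketch, not part of the role above; it assumes the `pg_autoscaler` manager module, which recent CEPH releases ship enabled by default:

```yaml
# Optional follow-up to pools.yml: let CEPH grow or shrink pg_num per pool
- name: Enable PG autoscaler on pools
  ansible.builtin.command:
    cmd: "ceph osd pool set {{ item.name }} pg_autoscale_mode on"
  loop: "{{ ceph_pools }}"
  loop_control:
    label: "{{ item.name }}"
  register: pool_autoscale
  changed_when: "'set pool' in pool_autoscale.stdout"
```

With the autoscaler on, the explicit `pg_num`/`pgp_num` values become starting hints rather than fixed commitments.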

## Phase 6: Verify CEPH Health

### Step 7: Health Verification

```yaml
# roles/proxmox_ceph/tasks/verify.yml
---
- name: Wait for CEPH to stabilize
  ansible.builtin.pause:
    seconds: 30

- name: Check CEPH cluster health
  ansible.builtin.command:
    cmd: ceph health
  register: ceph_health
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Get CEPH status
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_status
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Parse CEPH status
  ansible.builtin.set_fact:
    ceph_status_data: "{{ ceph_status.stdout | from_json }}"

- name: Calculate expected OSD count
  ansible.builtin.set_fact:
    expected_osd_count: >-
      {{
        ceph_osds.values()
        | list
        | flatten
        | map(attribute='partitions', default=1)
        | sum
      }}
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify OSD count
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_osds | int == expected_osd_count | int
    fail_msg: "Expected {{ expected_osd_count }} OSDs but found {{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify all OSDs are up
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_up_osds == ceph_status_data.osdmap.num_osds
    fail_msg: "Not all OSDs are up: {{ ceph_status_data.osdmap.num_up_osds }}/{{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Verify all OSDs are in
  ansible.builtin.assert:
    that:
      - ceph_status_data.osdmap.num_in_osds == ceph_status_data.osdmap.num_osds
    fail_msg: "Not all OSDs are in cluster: {{ ceph_status_data.osdmap.num_in_osds }}/{{ ceph_status_data.osdmap.num_osds }}"
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true

- name: Wait for PGs to become active+clean
  ansible.builtin.command:
    cmd: ceph pg stat --format json
  register: pg_stat
  changed_when: false
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
  until: >
    (pg_stat.stdout | from_json).num_pg_by_state
    | selectattr('name', 'equalto', 'active+clean')
    | map(attribute='num')
    | sum == (pg_stat.stdout | from_json).num_pgs
  retries: 60
  delay: 10

- name: Display CEPH cluster summary
  ansible.builtin.debug:
    msg: |
      CEPH Cluster Health: {{ ceph_health.stdout }}
      Total OSDs: {{ ceph_status_data.osdmap.num_osds }}
      OSDs Up: {{ ceph_status_data.osdmap.num_up_osds }}
      OSDs In: {{ ceph_status_data.osdmap.num_in_osds }}
      PGs: {{ ceph_status_data.pgmap.num_pgs }}
      Data: {{ ceph_status_data.pgmap.bytes_used | default(0) | human_readable }}
      Available: {{ ceph_status_data.pgmap.bytes_avail | default(0) | human_readable }}
  delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
  run_once: true
```

## Matrix Cluster Configuration Example

```yaml
# group_vars/matrix_cluster.yml (CEPH section)
---
# CEPH configuration
ceph_enabled: true
ceph_repository: "no-subscription"      # or "enterprise" with subscription
ceph_network: "192.168.5.0/24"          # vmbr1 - Public network
ceph_cluster_network: "192.168.7.0/24"  # vmbr2 - Private network

# OSD configuration (4 OSDs per node = 12 total)
ceph_osds:
  foxtrot:
    - device: /dev/nvme1n1
      partitions: 2  # Create 2 OSDs per 4TB NVMe
      device_size_gb: 4000
      partition_indices: [0, 1]
      db_device: null
      wal_device: null
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      db_device: null
      wal_device: null
      crush_device_class: nvme

  golf:
    - device: /dev/nvme1n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme

  hotel:
    - device: /dev/nvme1n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme
    - device: /dev/nvme2n1
      partitions: 2
      device_size_gb: 4000
      partition_indices: [0, 1]
      crush_device_class: nvme

# Pool configuration
ceph_pools:
  - name: vm_ssd
    pg_num: 128
    pgp_num: 128
    size: 3      # Replicate across 3 nodes
    min_size: 2  # Minimum 2 replicas required
    application: rbd
    compression: false

  - name: vm_containers
    pg_num: 64
    pgp_num: 64
    size: 3
    min_size: 2
    application: rbd
    compression: true
    compression_algorithm: zstd

# Safety flags
ceph_wipe_disks: false  # Set to true for fresh deployment (DESTRUCTIVE!)
```
|
||||||
|
|
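
As a rough sanity check on the `pg_num` values above: a common rule of thumb is total PGs ≈ (OSD count × 100) / replica size, rounded to a power of two. With 12 OSDs and `size: 3` that gives (12 × 100) / 3 = 400, i.e. roughly 512 PGs cluster-wide, so the 128 + 64 configured here leaves headroom for additional pools (and recent Ceph releases enable the PG autoscaler by default, which can adjust these values later).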

## Complete Playbook Example

```yaml
# playbooks/ceph-deploy.yml
---
- name: Deploy CEPH Storage on Proxmox Cluster
  hosts: "{{ cluster_group | default('matrix_cluster') }}"
  become: true
  serial: 1  # Deploy one node at a time

  pre_tasks:
    - name: Verify cluster is healthy
      ansible.builtin.command:
        cmd: pvecm status
      register: cluster_check
      changed_when: false
      failed_when: "'Quorate: Yes' not in cluster_check.stdout"

    - name: Verify CEPH networks MTU
      ansible.builtin.command:
        cmd: "ip link show {{ item }}"
      register: mtu_check
      changed_when: false
      failed_when: "'mtu 9000' not in mtu_check.stdout"
      loop:
        - vmbr1  # CEPH public
        - vmbr2  # CEPH private

    - name: Display CEPH configuration
      ansible.builtin.debug:
        msg: |
          Deploying CEPH to cluster: {{ cluster_name }}
          Public network: {{ ceph_network }}
          Cluster network: {{ ceph_cluster_network }}
          Expected OSDs: {{ ceph_osds.values() | list | flatten | map(attribute='partitions', default=1) | sum }}
      run_once: true

  roles:
    - role: proxmox_ceph

  post_tasks:
    - name: Display CEPH OSD tree
      ansible.builtin.command:
        cmd: ceph osd tree
      register: osd_tree_final
      changed_when: false
      delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
      run_once: true

    - name: Show OSD tree
      ansible.builtin.debug:
        var: osd_tree_final.stdout_lines
      run_once: true

    - name: Display pool information
      ansible.builtin.command:
        cmd: ceph osd pool ls detail
      register: pool_info
      changed_when: false
      delegate_to: "{{ groups[cluster_group | default('matrix_cluster')][0] }}"
      run_once: true

    - name: Show pool details
      ansible.builtin.debug:
        var: pool_info.stdout_lines
      run_once: true
```

## Usage

### Deploy CEPH to Matrix Cluster

```bash
# Check syntax
ansible-playbook playbooks/ceph-deploy.yml --syntax-check

# Deploy CEPH
ansible-playbook playbooks/ceph-deploy.yml --limit matrix_cluster

# Verify CEPH status
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph status"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph osd tree"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph df"
```

### Add mise Tasks

```toml
# .mise.toml
[tasks."ceph:deploy"]
description = "Deploy CEPH storage on cluster"
run = """
cd ansible
uv run ansible-playbook playbooks/ceph-deploy.yml
"""

[tasks."ceph:status"]
description = "Show CEPH cluster status"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "ceph -s"
"""

[tasks."ceph:health"]
description = "Show CEPH health detail"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "ceph health detail"
"""
```
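
Assuming mise is installed on the control machine, the tasks above then run as:

```bash
mise run ceph:deploy
mise run ceph:status
mise run ceph:health
```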

## Troubleshooting

### OSDs Won't Create

**Symptoms:**

- `pveceph osd create` fails with "already in use" error

**Solutions:**

1. Check if disk has existing partitions: `lsblk /dev/nvme1n1`
2. Wipe disk: `wipefs -a /dev/nvme1n1` (DESTRUCTIVE!)
3. Set `ceph_wipe_disks: true` in group_vars
4. Check for existing LVM: `pvdisplay`, `lvdisplay`
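
Steps 1, 2, and 4 combine into one destructive cleanup for a single device; a minimal sketch, assuming `/dev/nvme1n1` holds nothing you need:

```bash
# DESTRUCTIVE: removes LVM state and all on-disk signatures from the device
ceph-volume lvm zap /dev/nvme1n1 --destroy
wipefs -a /dev/nvme1n1
```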

### PGs Stuck in Creating

**Symptoms:**

- PGs stay in "creating" state for extended period

**Solutions:**

1. Check OSD status: `ceph osd tree`
2. Verify all OSDs are up and in: `ceph osd stat`
3. Check mon/mgr status: `ceph mon stat`, `ceph mgr stat`
4. Review logs: `journalctl -u ceph-osd@*.service -n 100`
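
To see exactly which PGs are stuck and why, Ceph can dump them directly:

```bash
# Also accepts: unclean, stale, undersized, degraded
ceph pg dump_stuck inactive
```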

### Poor CEPH Performance

**Symptoms:**

- Slow VM disk I/O

**Solutions:**

1. Verify MTU 9000: `ip link show vmbr1 | grep mtu`
2. Test network throughput: `iperf3` between nodes
3. Check OSD utilization: `ceph osd df`
4. Verify SSD/NVMe is being used: `ceph osd tree`
5. Check for rebalancing: `ceph -s` (look for "recovery")
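
Step 2 can be done node-to-node with iperf3; a minimal sketch, assuming iperf3 is installed and 192.168.5.6 is golf's address on the CEPH public network:

```bash
# On golf: start a listener
iperf3 -s

# On foxtrot: measure throughput toward golf with 4 parallel streams for 10s
iperf3 -c 192.168.5.6 -P 4 -t 10
```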

## Related Workflows

- [Cluster Formation](cluster-formation.md) - Form cluster before CEPH
- [Network Configuration](../reference/networking.md) - Configure CEPH networks
- [Storage Management](../reference/storage-management.md) - Manage CEPH pools and OSDs

## References

- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 1431-1562)
- Proxmox VE CEPH documentation
- CEPH deployment best practices
- [Ansible CEPH automation pattern](../../.claude/skills/ansible-best-practices/patterns/ceph-automation.md)
646
skills/proxmox-infrastructure/workflows/cluster-formation.md
Normal file
646
skills/proxmox-infrastructure/workflows/cluster-formation.md
Normal file
@@ -0,0 +1,646 @@
# Proxmox Cluster Formation Workflow

Complete guide to forming a Proxmox VE cluster using Ansible automation with idempotent patterns.

## Overview

This workflow automates the creation of a Proxmox VE cluster with:

- Hostname resolution configuration
- SSH key distribution for cluster operations
- Idempotent cluster initialization
- Corosync network configuration
- Quorum and health verification

## Prerequisites

Before forming a cluster:

1. **All nodes must have:**
   - Proxmox VE 8.x or 9.x installed
   - Network connectivity on the management network
   - Dedicated corosync network configured (VLAN 9 for Matrix)
   - Unique hostnames
   - Synchronized time (NTP configured)

2. **Minimum requirements:**
   - At least 3 nodes for quorum (production)
   - 1 node for development/testing (not recommended)

3. **Network requirements:**
   - All nodes must be able to resolve each other's hostnames
   - Corosync network must be isolated (no VM traffic)
   - Low latency between nodes (<2ms recommended)
   - MTU 1500 on the management network
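
A quick manual preflight catches most of these issues before any playbook runs; a sketch, assuming root SSH access and the Matrix node names used below:

```bash
# Check hostname, PVE version, and NTP sync state on every node
for node in foxtrot golf hotel; do
  ssh root@"$node" 'hostname; pveversion; timedatectl show -p NTPSynchronized --value'
done
```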

## Phase 1: Prepare Cluster Nodes

### Step 1: Verify Prerequisites

```yaml
# roles/proxmox_cluster/tasks/prerequisites.yml
---
- name: Check Proxmox VE is installed
  ansible.builtin.stat:
    path: /usr/bin/pvecm
  register: pvecm_binary
  failed_when: not pvecm_binary.stat.exists

- name: Get Proxmox VE version
  ansible.builtin.command:
    cmd: pveversion
  register: pve_version
  changed_when: false

- name: Verify minimum Proxmox VE version
  ansible.builtin.assert:
    that:
      - "'pve-manager/9' in pve_version.stdout or 'pve-manager/8' in pve_version.stdout"
    fail_msg: "Proxmox VE 8.x or 9.x required"

- name: Verify minimum node count for production
  ansible.builtin.assert:
    that:
      - groups[cluster_group] | length >= 3
    fail_msg: "Production cluster requires at least 3 nodes for quorum"
  when: cluster_environment == 'production'

- name: Check no existing cluster membership
  ansible.builtin.command:
    cmd: pvecm status
  register: existing_cluster
  failed_when: false
  changed_when: false

- name: Display cluster warning if already member
  ansible.builtin.debug:
    msg: |
      WARNING: Node {{ inventory_hostname }} is already a cluster member.
      Current cluster: {{ existing_cluster.stdout }}
      This playbook will attempt to join the target cluster.
  when:
    - existing_cluster.rc == 0
    - cluster_name not in existing_cluster.stdout
```

### Step 2: Configure Hostname Resolution

```yaml
# roles/proxmox_cluster/tasks/hosts_config.yml
---
- name: Ensure cluster nodes in /etc/hosts (management IP)
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: "^{{ item.management_ip }}\\s+"
    line: "{{ item.management_ip }} {{ item.fqdn }} {{ item.short_name }}"
    state: present
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"

- name: Ensure corosync IPs in /etc/hosts
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: "^{{ item.corosync_ip }}\\s+"
    line: "{{ item.corosync_ip }} {{ item.short_name }}-corosync"
    state: present
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"

- name: Verify hostname resolution (forward)
  ansible.builtin.command:
    cmd: "getent hosts {{ item.fqdn }}"
  register: host_lookup
  failed_when: host_lookup.rc != 0
  changed_when: false
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.fqdn }}"

- name: Verify hostname resolution (reverse)
  ansible.builtin.command:
    cmd: "getent hosts {{ item.management_ip }}"
  register: reverse_lookup
  failed_when:
    - reverse_lookup.rc != 0
  changed_when: false
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.management_ip }}"

- name: Test corosync network connectivity
  ansible.builtin.command:
    cmd: "ping -c 3 -W 2 {{ item.corosync_ip }}"
  register: corosync_ping
  changed_when: false
  when: item.short_name != inventory_hostname_short
  loop: "{{ cluster_nodes }}"
  loop_control:
    label: "{{ item.short_name }}"
```

### Step 3: Distribute SSH Keys

```yaml
# roles/proxmox_cluster/tasks/ssh_keys.yml
---
- name: Generate SSH key for root (if not exists)
  ansible.builtin.user:
    name: root
    generate_ssh_key: true
    ssh_key_type: ed25519
    ssh_key_comment: "root@{{ inventory_hostname }}"
  register: root_ssh_key

- name: Fetch public keys from all nodes
  ansible.builtin.slurp:
    src: /root/.ssh/id_ed25519.pub
  register: node_public_keys

- name: Distribute SSH keys to all nodes
  ansible.posix.authorized_key:
    user: root
    state: present
    key: "{{ hostvars[item].node_public_keys.content | b64decode }}"
    comment: "cluster-{{ item }}"
  loop: "{{ groups[cluster_group] }}"
  when: item != inventory_hostname

- name: Populate known_hosts with node SSH keys
  # ssh-keyscan appends on every run, so this task is not idempotent;
  # duplicate entries are harmless but accumulate across reruns
  ansible.builtin.shell:
    cmd: "ssh-keyscan -H {{ item }} >> /root/.ssh/known_hosts"
  when: item != inventory_hostname
  loop: "{{ groups[cluster_group] }}"
  loop_control:
    label: "{{ item }}"
  changed_when: true

- name: Test SSH connectivity to all nodes
  ansible.builtin.command:
    cmd: "ssh -o ConnectTimeout=5 {{ item }} hostname"
  register: ssh_test
  changed_when: false
  when: item != inventory_hostname
  loop: "{{ groups[cluster_group] }}"
  loop_control:
    label: "{{ item }}"
```
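
If rerun hygiene matters, the keyscan task can be replaced with the `known_hosts` module; a sketch, assuming the Ansible controller itself can reach each node (the `pipe` lookup runs on the controller, not the target):

```yaml
- name: Populate known_hosts idempotently
  ansible.builtin.known_hosts:
    path: /root/.ssh/known_hosts
    name: "{{ item }}"
    key: "{{ lookup('ansible.builtin.pipe', 'ssh-keyscan -t ed25519 ' ~ item) }}"
  loop: "{{ groups[cluster_group] }}"
  when: item != inventory_hostname
```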

## Phase 2: Initialize Cluster

### Step 4: Create Cluster (First Node Only)

```yaml
# roles/proxmox_cluster/tasks/cluster_init.yml
---
- name: Check existing cluster status
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_status
  failed_when: false
  changed_when: false

- name: Get cluster nodes list
  ansible.builtin.command:
    cmd: pvecm nodes
  register: cluster_nodes_check
  failed_when: false
  changed_when: false

- name: Set cluster facts
  ansible.builtin.set_fact:
    in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"

- name: Create new cluster on first node
  ansible.builtin.command:
    cmd: "pvecm create {{ cluster_name }} --link0 {{ corosync_link0_address }}"
  when: not in_target_cluster
  register: cluster_create
  changed_when: cluster_create.rc == 0

- name: Wait for cluster to initialize
  ansible.builtin.pause:
    seconds: 10
  when: cluster_create.changed

- name: Verify cluster creation
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_verify
  changed_when: false
  failed_when: cluster_name not in cluster_verify.stdout

- name: Display cluster status
  ansible.builtin.debug:
    var: cluster_verify.stdout_lines
  when: cluster_create.changed or ansible_verbosity > 0
```

### Step 5: Join Nodes to Cluster

```yaml
# roles/proxmox_cluster/tasks/cluster_join.yml
---
- name: Check if already in cluster
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_status
  failed_when: false
  changed_when: false

- name: Set membership facts
  ansible.builtin.set_fact:
    is_cluster_member: "{{ cluster_status.rc == 0 }}"
    in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"

- name: Get first node hostname
  ansible.builtin.set_fact:
    first_node_hostname: "{{ groups[cluster_group][0] }}"  # group members are already inventory hostnames

- name: Join cluster
  ansible.builtin.command:
    cmd: >
      pvecm add {{ first_node_hostname }}
      --link0 {{ corosync_link0_address }}
  when:
    - not is_cluster_member or not in_target_cluster
  register: cluster_join
  changed_when: cluster_join.rc == 0
  failed_when:
    - cluster_join.rc != 0
    - "'already in a cluster' not in cluster_join.stderr"

- name: Wait for node to join cluster
  ansible.builtin.pause:
    seconds: 10
  when: cluster_join.changed

- name: Verify cluster membership
  ansible.builtin.command:
    cmd: pvecm status
  register: join_verify
  changed_when: false
  failed_when:
    - "'Quorate: Yes' not in join_verify.stdout"
```

## Phase 3: Configure Corosync

### Step 6: Corosync Network Configuration

```yaml
# roles/proxmox_cluster/tasks/corosync.yml
---
- name: Get current corosync configuration
  ansible.builtin.slurp:
    src: /etc/pve/corosync.conf
  register: corosync_conf_current

- name: Parse current corosync config
  ansible.builtin.set_fact:
    current_corosync: "{{ corosync_conf_current.content | b64decode }}"

- name: Check if corosync config needs update
  ansible.builtin.set_fact:
    corosync_needs_update: "{{ corosync_network not in current_corosync }}"

- name: Backup corosync.conf
  ansible.builtin.copy:
    src: /etc/pve/corosync.conf
    dest: "/etc/pve/corosync.conf.{{ ansible_date_time.epoch }}.bak"
    remote_src: true
    mode: '0640'
  when: corosync_needs_update
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true

- name: Update corosync configuration
  ansible.builtin.template:
    src: corosync.conf.j2
    dest: /etc/pve/corosync.conf.new
    validate: corosync-cfgtool -c %s
    mode: '0640'
  when: corosync_needs_update
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true

- name: Apply new corosync configuration
  ansible.builtin.copy:
    src: /etc/pve/corosync.conf.new
    dest: /etc/pve/corosync.conf
    remote_src: true
    mode: '0640'
  when: corosync_needs_update
  notify:
    - reload corosync
  delegate_to: "{{ groups[cluster_group][0] }}"
  run_once: true
```

**Corosync Template Example:**

```jinja2
# templates/corosync.conf.j2
totem {
  version: 2
  cluster_name: {{ cluster_name }}
  {# pmxcfs only propagates corosync.conf when config_version increases;
     corosync_config_version is assumed to be bumped in group_vars on every change #}
  config_version: {{ corosync_config_version | default(2) }}
  transport: knet
  crypto_cipher: aes256
  crypto_hash: sha256

  interface {
    linknumber: 0
    knet_link_priority: 255
  }
}

nodelist {
{% for node in cluster_nodes %}
  node {
    name: {{ node.short_name }}
    nodeid: {{ node.node_id }}
    quorum_votes: 1
    ring0_addr: {{ node.corosync_ip }}
  }
{% endfor %}
}

quorum {
  provider: corosync_votequorum
{% if cluster_nodes | length == 2 %}
  two_node: 1
{% endif %}
}

logging {
  to_logfile: yes
  logfile: /var/log/corosync/corosync.log
  to_syslog: yes
  timestamp: on
}
```

## Phase 4: Verify Cluster Health

### Step 7: Health Checks

```yaml
# roles/proxmox_cluster/tasks/verify.yml
---
- name: Wait for cluster to stabilize
  ansible.builtin.pause:
    seconds: 15

- name: Check cluster quorum
  ansible.builtin.command:
    cmd: pvecm status
  register: cluster_health
  changed_when: false
  failed_when: "'Quorate: Yes' not in cluster_health.stdout"

- name: Get cluster node count
  ansible.builtin.command:
    cmd: pvecm nodes
  register: cluster_nodes_final
  changed_when: false

- name: Verify expected node count
  ansible.builtin.assert:
    that:
      - cluster_nodes_final.stdout_lines | length >= groups[cluster_group] | length
    fail_msg: "Expected {{ groups[cluster_group] | length }} nodes but found {{ cluster_nodes_final.stdout_lines | length }}"

- name: Check corosync ring status
  ansible.builtin.command:
    cmd: corosync-cfgtool -s
  register: corosync_status
  changed_when: false

- name: Verify all nodes in corosync
  ansible.builtin.assert:
    that:
      - "'online' in corosync_status.stdout"
    fail_msg: "Corosync ring issues detected"

- name: Get cluster configuration version
  ansible.builtin.command:
    cmd: corosync-cmapctl -b totem.config_version
  register: config_version
  changed_when: false

- name: Display cluster health summary
  ansible.builtin.debug:
    msg: |
      Cluster: {{ cluster_name }}
      Quorum: {{ 'Yes' if 'Quorate: Yes' in cluster_health.stdout else 'No' }}
      Nodes: {{ cluster_nodes_final.stdout_lines | length }}
      Config Version: {{ config_version.stdout }}
```

## Matrix Cluster Example Configuration

```yaml
# group_vars/matrix_cluster.yml
---
cluster_name: "Matrix"
cluster_group: "matrix_cluster"
cluster_environment: "production"

# Corosync configuration
corosync_network: "192.168.8.0/24"  # VLAN 9

# Node configuration
cluster_nodes:
  - short_name: foxtrot
    fqdn: foxtrot.matrix.spaceships.work
    management_ip: 192.168.3.5
    corosync_ip: 192.168.8.5
    node_id: 1

  - short_name: golf
    fqdn: golf.matrix.spaceships.work
    management_ip: 192.168.3.6
    corosync_ip: 192.168.8.6
    node_id: 2

  - short_name: hotel
    fqdn: hotel.matrix.spaceships.work
    management_ip: 192.168.3.7
    corosync_ip: 192.168.8.7
    node_id: 3

# Set per-node corosync address
corosync_link0_address: "{{ cluster_nodes | selectattr('short_name', 'equalto', inventory_hostname_short) | map(attribute='corosync_ip') | first }}"
```
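
To confirm each node resolves its own link0 address from this structure, a quick ad-hoc check (assuming the inventory path used elsewhere in this document):

```bash
ansible -i inventory/proxmox.yml matrix_cluster -m ansible.builtin.debug -a "var=corosync_link0_address"
```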

## Complete Playbook Example

```yaml
# playbooks/cluster-init.yml
---
- name: Initialize Proxmox Cluster
  hosts: "{{ cluster_group | default('matrix_cluster') }}"
  become: true
  serial: 1  # One node at a time for safety

  pre_tasks:
    - name: Validate cluster group is defined
      ansible.builtin.assert:
        that:
          - cluster_group is defined
          - cluster_name is defined
          - cluster_nodes is defined
        fail_msg: "Required variables not defined in group_vars"

    - name: Display cluster configuration
      ansible.builtin.debug:
        msg: |
          Forming cluster: {{ cluster_name }}
          Nodes: {{ cluster_nodes | map(attribute='short_name') | join(', ') }}
          Corosync network: {{ corosync_network }}
      run_once: true

  tasks:
    # include_role resolves the role's task files; role_path is only
    # defined inside a role, so it cannot be used from a playbook
    - name: Verify prerequisites
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: prerequisites

    - name: Configure /etc/hosts
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: hosts_config

    - name: Distribute SSH keys
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: ssh_keys

    # First node creates cluster
    - name: Initialize cluster on first node
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: cluster_init
      when: inventory_hostname == groups[cluster_group][0]

    # Wait for first node
    - name: Wait for first node to complete
      ansible.builtin.pause:
        seconds: 20
      when: inventory_hostname != groups[cluster_group][0]

    # Other nodes join
    - name: Join cluster on other nodes
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: cluster_join
      when: inventory_hostname != groups[cluster_group][0]

    - name: Configure corosync
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: corosync

    - name: Verify cluster health
      ansible.builtin.include_role:
        name: proxmox_cluster
        tasks_from: verify

  post_tasks:
    - name: Display final cluster status
      ansible.builtin.command:
        cmd: pvecm status
      register: final_status
      changed_when: false
      delegate_to: "{{ groups[cluster_group][0] }}"
      run_once: true

    - name: Show cluster status
      ansible.builtin.debug:
        var: final_status.stdout_lines
      run_once: true

  handlers:
    - name: reload corosync
      ansible.builtin.systemd:
        name: corosync
        state: reloaded
      throttle: 1
```

## Usage

### Initialize Matrix Cluster

```bash
# Check syntax
ansible-playbook playbooks/cluster-init.yml --syntax-check

# Dry run (of limited value here: command tasks are skipped in check mode)
ansible-playbook playbooks/cluster-init.yml --check --diff

# Initialize cluster
ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster

# Verify cluster status
ansible -i inventory/proxmox.yml foxtrot -m shell -a "pvecm status"
ansible -i inventory/proxmox.yml foxtrot -m shell -a "pvecm nodes"
```

### Add mise Task

```toml
# .mise.toml
[tasks."cluster:init"]
description = "Initialize Proxmox cluster"
run = """
cd ansible
uv run ansible-playbook playbooks/cluster-init.yml
"""

[tasks."cluster:status"]
description = "Show cluster status"
run = """
ansible -i ansible/inventory/proxmox.yml foxtrot -m shell -a "pvecm status"
"""
```

## Troubleshooting

### Node Won't Join Cluster

**Symptoms:**

- `pvecm add` fails with timeout or connection error

**Solutions:**

1. Verify SSH connectivity: `ssh root@first-node hostname`
2. Check /etc/hosts: `getent hosts first-node`
3. Verify corosync network: `ping -c 3 192.168.8.5`
4. Check firewall for the corosync ports (UDP 5405-5412): `iptables -L -n | grep 540`

### Cluster Shows No Quorum

**Symptoms:**

- `pvecm status` shows `Quorate: No`

**Solutions:**

1. Check node count: Must have majority (2 of 3, 3 of 5, etc.)
2. Verify corosync: `systemctl status corosync`
3. Check corosync ring: `corosync-cfgtool -s`
4. Review logs: `journalctl -u corosync -n 50`
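
If a node is up but the cluster cannot regain majority (for example, during maintenance with only one survivor), Proxmox can temporarily lower the expected vote count; use with care, as it deliberately overrides quorum protection:

```bash
# On the surviving node only; restore normal expectations once nodes return
pvecm expected 1
```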

### Configuration Sync Issues

**Symptoms:**

- Changes on one node don't appear on others

**Solutions:**

1. Verify pmxcfs: `systemctl status pve-cluster`
2. Check filesystem: `pvecm status | grep -i cluster`
3. Restart cluster filesystem: `systemctl restart pve-cluster`

## Related Workflows

- [CEPH Deployment](ceph-deployment.md) - Deploy CEPH after cluster formation
- [Network Configuration](../reference/networking.md) - Configure cluster networking
- [Cluster Maintenance](cluster-maintenance.md) - Add/remove nodes, upgrades

## References

- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 1318-1428)
- Proxmox VE Cluster Manager documentation
- Corosync configuration guide
- [Ansible cluster automation pattern](../../ansible-best-practices/patterns/cluster-automation.md)