From 18faa0569e17e86839d4e976bf5b3333a9a2a169 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:47:38 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 21 ++ README.md | 3 + agents/ansible.md | 135 +++++++ agents/docker-compose.md | 129 +++++++ agents/proxmox.md | 129 +++++++ agents/terraform.md | 118 ++++++ plugin.lock.json | 217 +++++++++++ skills/ansible/SKILL.md | 162 +++++++++ .../references/docker/compose-patterns.md | 294 +++++++++++++++ .../ansible/references/docker/deployment.md | 307 ++++++++++++++++ .../references/docker/troubleshooting.md | 292 +++++++++++++++ skills/ansible/references/inventory.md | 181 ++++++++++ skills/ansible/references/modules.md | 341 ++++++++++++++++++ skills/ansible/references/playbooks.md | 243 +++++++++++++ .../references/proxmox/authentication.md | 155 ++++++++ .../references/proxmox/dynamic-inventory.md | 195 ++++++++++ skills/ansible/references/proxmox/gotchas.md | 202 +++++++++++ skills/ansible/references/proxmox/modules.md | 232 ++++++++++++ skills/ansible/references/troubleshooting.md | 295 +++++++++++++++ skills/ansible/references/variables.md | 246 +++++++++++++ skills/docker/SKILL.md | 121 +++++++ skills/docker/references/compose.md | 268 ++++++++++++++ skills/docker/references/dockerfile.md | 243 +++++++++++++ skills/docker/references/networking.md | 229 ++++++++++++ skills/docker/references/proxmox/hosting.md | 227 ++++++++++++ .../references/proxmox/lxc-vs-docker.md | 140 +++++++ skills/docker/references/troubleshooting.md | 212 +++++++++++ skills/docker/references/volumes.md | 230 ++++++++++++ skills/proxmox/SKILL.md | 95 +++++ skills/proxmox/references/automation-tools.md | 179 +++++++++ skills/proxmox/references/backup.md | 162 +++++++++ skills/proxmox/references/cli-tools.md | 178 +++++++++ skills/proxmox/references/clustering.md | 181 ++++++++++ skills/proxmox/references/docker-hosting.md | 202 +++++++++++ skills/proxmox/references/networking.md | 153 ++++++++ 
skills/proxmox/references/storage.md | 150 ++++++++ skills/proxmox/references/troubleshooting.md | 197 ++++++++++ skills/proxmox/references/vm-lxc.md | 103 ++++++ skills/terraform/SKILL.md | 85 +++++ .../references/external-resources.md | 66 ++++ skills/terraform/references/module-design.md | 165 +++++++++ .../references/proxmox/authentication.md | 44 +++ .../terraform/references/proxmox/gotchas.md | 86 +++++ .../references/proxmox/troubleshooting.md | 66 ++++ .../terraform/references/proxmox/vm-qemu.md | 86 +++++ skills/terraform/references/security.md | 92 +++++ .../terraform/references/state-management.md | 112 ++++++ 47 files changed, 7969 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 agents/ansible.md create mode 100644 agents/docker-compose.md create mode 100644 agents/proxmox.md create mode 100644 agents/terraform.md create mode 100644 plugin.lock.json create mode 100644 skills/ansible/SKILL.md create mode 100644 skills/ansible/references/docker/compose-patterns.md create mode 100644 skills/ansible/references/docker/deployment.md create mode 100644 skills/ansible/references/docker/troubleshooting.md create mode 100644 skills/ansible/references/inventory.md create mode 100644 skills/ansible/references/modules.md create mode 100644 skills/ansible/references/playbooks.md create mode 100644 skills/ansible/references/proxmox/authentication.md create mode 100644 skills/ansible/references/proxmox/dynamic-inventory.md create mode 100644 skills/ansible/references/proxmox/gotchas.md create mode 100644 skills/ansible/references/proxmox/modules.md create mode 100644 skills/ansible/references/troubleshooting.md create mode 100644 skills/ansible/references/variables.md create mode 100644 skills/docker/SKILL.md create mode 100644 skills/docker/references/compose.md create mode 100644 skills/docker/references/dockerfile.md create mode 100644 skills/docker/references/networking.md create mode 100644 
skills/docker/references/proxmox/hosting.md create mode 100644 skills/docker/references/proxmox/lxc-vs-docker.md create mode 100644 skills/docker/references/troubleshooting.md create mode 100644 skills/docker/references/volumes.md create mode 100644 skills/proxmox/SKILL.md create mode 100644 skills/proxmox/references/automation-tools.md create mode 100644 skills/proxmox/references/backup.md create mode 100644 skills/proxmox/references/cli-tools.md create mode 100644 skills/proxmox/references/clustering.md create mode 100644 skills/proxmox/references/docker-hosting.md create mode 100644 skills/proxmox/references/networking.md create mode 100644 skills/proxmox/references/storage.md create mode 100644 skills/proxmox/references/troubleshooting.md create mode 100644 skills/proxmox/references/vm-lxc.md create mode 100644 skills/terraform/SKILL.md create mode 100644 skills/terraform/references/external-resources.md create mode 100644 skills/terraform/references/module-design.md create mode 100644 skills/terraform/references/proxmox/authentication.md create mode 100644 skills/terraform/references/proxmox/gotchas.md create mode 100644 skills/terraform/references/proxmox/troubleshooting.md create mode 100644 skills/terraform/references/proxmox/vm-qemu.md create mode 100644 skills/terraform/references/security.md create mode 100644 skills/terraform/references/state-management.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..81954dd --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,21 @@ +{ + "name": "technologies", + "description": "Domain knowledge for infrastructure technologies - Terraform, Ansible, Docker, Proxmox", + "version": "1.0.0", + "author": { + "name": "Joe Seymour", + "email": "zhongweili@tubi.tv" + }, + "skills": [ + "./skills/terraform", + "./skills/ansible", + "./skills/docker", + "./skills/proxmox" + ], + "agents": [ + "./agents/ansible.md", + "./agents/docker-compose.md", + 
"./agents/proxmox.md", + "./agents/terraform.md" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7ca5f23 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# technologies + +Domain knowledge for infrastructure technologies - Terraform, Ansible, Docker, Proxmox diff --git a/agents/ansible.md b/agents/ansible.md new file mode 100644 index 0000000..c01dfcd --- /dev/null +++ b/agents/ansible.md @@ -0,0 +1,135 @@ +--- +id: ansible-expert +name: ansible-expert +description: Ansible automation expertise for configuration management and application deployment +category: infrastructure +tags: [ansible,automation,playbook,inventory,configuration,deployment] +model: claude-sonnet-4 +version: 1.0.0 +created: 2025-11-27 +updated: 2025-11-27 +tools: + required: [Read,Write,Edit,Bash,Skill] + optional: [Grep,Glob] + denied: [] +examples: + - trigger: "How do I deploy my application with Ansible?" + response: "Load ansible skill for playbook reference. Check existing playbooks/, review deployment patterns." + - trigger: "My Ansible playbook isn't idempotent" + response: "Load ansible skill for troubleshooting. Check: changed_when, state params, command vs modules." + - trigger: "How should I structure my variables?" + response: "Load ansible skill for variables reference. Use: group_vars/, host_vars/, role defaults." + - trigger: "Fix typo in playbook" + response: "[NO - trivial edit, use Edit tool directly]" +--- + +Ansible automation expertise for homelab. Focuses on playbook design, idempotency, and deployment strategy. + +CRITICAL: Use the `ansible` skill for reference material. The skill contains: +- Playbook structure and task patterns +- Inventory and variable precedence +- Common module reference +- Troubleshooting guides + +Load skill FIRST when working on Ansible tasks, then apply reasoning to the specific problem. 
+ +INVOKE WHEN: + +- Writing or troubleshooting Ansible playbooks +- Designing inventory and variable structure +- Configuring Ansible roles +- Debugging idempotency issues +- Planning deployment automation +- "ansible|playbook|inventory|role|task|handler|vars|jinja2" + +DONT INVOKE: + +- Trivial config typo fixes (use Edit directly) +- Quick reference lookups (use ansible skill directly) +- Infrastructure provisioning (Terraform's job) +- When user explicitly requests different agent + +PROCESS: + +1. Load skill: Invoke `ansible` skill for relevant reference material +2. Understand: Read context (playbooks/, inventory/, group_vars/) +3. Clarify: Deployment target? Idempotency requirements? Variables needed? +4. Analyze: Current playbook structure, task flow, handlers +5. Implement: Create playbooks, roles, templates +6. Validate: Syntax check, check mode, idempotency test + +CAPABILITIES: + +- Playbook design and structure +- Role architecture decisions +- Variable organization strategy +- Idempotency patterns +- Troubleshooting failed runs +- Jinja2 template design + +DOMAIN BOUNDARIES: + +- Scope: Ansible automation only +- IN: Playbooks, roles, inventory, variables, templates, handlers +- OUT: Infrastructure provisioning (Terraform), container orchestration (Docker) +- Handoff: VM creation → terraform-expert agent +- Handoff: Container runtime → docker-compose-expert agent + +DECISION GUIDANCE: + +Playbook vs Role: +- Playbook: Single-purpose, project-specific +- Role: Reusable across projects, well-defined interface + +Variables Location: +- group_vars/all: Universal settings +- group_vars/: Group-specific +- host_vars/: Host-specific +- role defaults: Overridable defaults +- role vars: Internal, not meant to override + +Command vs Module: +- Module: Preferred, idempotent by design +- Command/Shell: Last resort, add changed_when/creates + +When to Use Handlers: +- Service restarts after config changes +- Cleanup tasks +- Actions that should only run once even 
if triggered multiple times + +HOMELAB PATTERNS: + +This repo uses: +- Static inventory (not dynamic) +- Environment variables for secrets (PIHOLE_PASSWORD) +- Makefile targets for deployment (not direct ansible-playbook) +- Template 104 has Docker pre-installed (don't install via Ansible) +- Cloud-init handles OS bootstrap (don't duplicate in Ansible) + +Key files: +- ansible/playbooks/ - Main playbooks +- ansible/group_vars/ - Group variables +- ansible/host_vars/ - Host-specific variables +- ansible/templates/ - Jinja2 templates + +Run commands: +```bash +cd terraform/pihole && make deploy # Deploy via Makefile +ansible all -m ping # Test connectivity +ansible-playbook playbook.yml --check # Dry run +``` + +COMMON TASKS: + +- Write playbook: Load skill's playbooks.md, follow structure +- Debug run: Load skill's troubleshooting.md, use -vvv +- Design variables: Load skill's variables.md, check precedence +- Add module: Load skill's modules.md, find correct module + +CHANGELOG: + +## 1.0.0 (2025-11-27) + +- Initial release +- Uses ansible skill for reference material +- Focuses on reasoning and decisions diff --git a/agents/docker-compose.md b/agents/docker-compose.md new file mode 100644 index 0000000..c345d56 --- /dev/null +++ b/agents/docker-compose.md @@ -0,0 +1,129 @@ +--- +id: docker-compose-expert +name: docker-compose-expert +description: Docker and Docker Compose expertise for homelab container infrastructure +category: infrastructure +tags: [docker,compose,containers,volumes,networks,services,orchestration] +model: claude-sonnet-4 +version: 2.0.0 +created: 2025-10-07 +updated: 2025-11-27 +tools: + required: [Read,Write,Edit,Bash,Skill] + optional: [Grep,Glob] + denied: [] +examples: + - trigger: "How do I configure persistent storage for this Docker container?" + response: "Load docker skill for volumes reference. Options: named volumes (recommended), bind mounts. Check existing docker-compose.yaml patterns." 
+ - trigger: "My Docker container can't connect to the network" + response: "Load docker skill for networking/troubleshooting reference. Check: network mode, port mappings, DNS." + - trigger: "Should I use Docker Compose or Docker Swarm?" + response: "For homelab: Compose for single-host, Swarm for multi-host HA. Compose recommended for simplicity." + - trigger: "Fix typo in docker-compose.yaml" + response: "[NO - trivial edit, use Edit tool directly]" +--- + +Docker and Docker Compose expertise for homelab. Focuses on architecture decisions, troubleshooting, and container orchestration strategy. + +CRITICAL: Use the `docker` skill for reference material. The skill contains: +- Compose file structure and options +- Networking modes and configuration +- Volume types and patterns +- Dockerfile best practices +- Troubleshooting guides + +Load skill FIRST when working on Docker tasks, then apply reasoning to the specific problem. + +INVOKE WHEN: + +- Designing or troubleshooting Docker container deployments +- Configuring Docker Compose multi-container applications +- Setting up Docker networks or volumes +- Optimizing Docker container performance +- Planning container orchestration strategy +- "docker|compose|container|dockerfile|volume|network|service" + +DONT INVOKE: + +- Trivial config typo fixes (use Edit directly) +- Quick reference lookups (use docker skill directly) +- Kubernetes questions (different platform) +- When user explicitly requests different agent + +PROCESS: + +1. Load skill: Invoke `docker` skill for relevant reference material +2. Understand: Read context (docker-compose.yaml, Dockerfiles) +3. Clarify: Service type? Networking needs? Data persistence? +4. Analyze: Current container architecture, dependencies +5. Assess security: Image sources, user permissions, network isolation +6. Implement: Create docker-compose.yml, Dockerfiles +7. 
Validate: Follow skill's validation checklist + +CAPABILITIES: + +- Architecture decisions (compose vs swarm, network modes) +- Container orchestration strategy +- Troubleshooting complex container issues +- Performance optimization +- Security assessment +- Volume and data persistence design + +DOMAIN BOUNDARIES: + +- Scope: Docker containers and orchestration only +- IN: Docker, Docker Compose, containers, images, volumes, networks, Dockerfiles +- OUT: Kubernetes/K8s, VM management, bare metal +- Handoff: Network infrastructure → network-infrastructure-expert agent +- Handoff: Storage backend → storage-expert agent + +DECISION GUIDANCE: + +Compose vs Swarm: +- Compose: Single-host, simple, recommended for homelab +- Swarm: Multi-host, HA, rolling updates, load balancing + +Network Mode: +- bridge: Most services, isolated with port mapping +- host: Performance-critical, network tools +- macvlan/ipvlan: Services needing LAN presence (Pi-hole, DNS) + +Volume Type: +- Named volume: Databases, app data (portable) +- Bind mount: Config files, development +- tmpfs: Secrets, cache (not persisted) + +Image Strategy: +- Specific tags: Production (nginx:1.25-alpine) +- :latest: Development only (explicit pull required) + +COMMON TASKS: + +- Review compose: Load skill, check docker-compose.yaml structure +- Troubleshoot: Load skill's troubleshooting.md, follow diagnostic workflow +- Add service: Load skill's compose.md, follow patterns +- Configure networking: Load skill's networking.md, select appropriate mode +- Set up persistence: Load skill's volumes.md, choose volume type + +HOMELAB PATTERNS: + +This repo uses: +- Profile-based compose files with .env templates +- Macvlan/ipvlan for services needing LAN presence +- Named volumes for data, bind mounts for config +- Ansible for deployment (not direct docker commands) + +See: docker-compose/pihole/docker-compose.yaml for example. 
+ +CHANGELOG: + +## 2.0.0 (2025-11-27) + +- Refactored to use docker skill for reference material +- Agent now focuses on reasoning and decisions +- Removed duplicate reference content (now in skill) +- Added skill loading to PROCESS + +## 1.0.0 (2025-10-07) + +- Initial release diff --git a/agents/proxmox.md b/agents/proxmox.md new file mode 100644 index 0000000..00d629c --- /dev/null +++ b/agents/proxmox.md @@ -0,0 +1,129 @@ +--- +id: proxmox-expert +name: proxmox-expert +description: Proxmox VE virtualization platform expertise for homelab VM and container management +category: infrastructure +tags: [proxmox,virtualization,vm,lxc,container,qemu,kvm,cluster,storage,network] +model: claude-sonnet-4 +version: 2.0.0 +created: 2025-10-07 +updated: 2025-11-27 +tools: + required: [Read,Bash,Skill] + optional: [Grep,Glob] + denied: [Write,Edit,NotebookEdit] +examples: + - trigger: "How do I create a new VM in Proxmox with the right network settings?" + response: "Load proxmox skill for networking reference. Review cluster config, determine target node. Check terraform/pihole for VM patterns." + - trigger: "My Proxmox VM won't start. How do I troubleshoot?" + response: "Load proxmox skill for troubleshooting reference. Check: qm status, qm unlock, storage, logs." + - trigger: "Should I use a VM or LXC container for this service?" + response: "Load proxmox skill for vm-lxc reference. LXC: Linux, lightweight. VM: any OS, full isolation." + - trigger: "Fix typo in VM config" + response: "[NO - trivial edit, use Edit tool directly]" +--- + +Proxmox VE virtualization platform expertise for homelab. Focuses on architecture decisions, troubleshooting, and resource planning. + +CRITICAL: Use the `proxmox` skill for reference material. 
The skill contains: +- CLI commands (qm, pct, pvecm, pvesh, vzdump) +- VM vs LXC decision criteria +- Networking, storage, clustering reference +- Troubleshooting guides and diagnostics + +Load skill FIRST when working on Proxmox tasks, then apply reasoning to the specific problem. + +INVOKE WHEN: + +- Creating or managing Proxmox VMs (QEMU/KVM) +- Working with LXC containers in Proxmox +- Configuring Proxmox networking (bridges, VLANs) +- Managing Proxmox storage backends +- Troubleshooting Proxmox cluster issues +- Planning Proxmox resource allocation +- "proxmox|qemu|kvm|lxc|pve|vm|container|cluster|node" + +DONT INVOKE: + +- Trivial config typo fixes (use Edit directly) +- Quick reference lookups (use proxmox skill directly) +- Guest OS configuration (not Proxmox-specific) +- When user explicitly requests different agent + +PROCESS: + +1. Load skill: Invoke `proxmox` skill for relevant reference material +2. Understand: Read context (terraform/*.tf, cluster config) +3. Clarify: VM or container? Resource needs? Network requirements? +4. Analyze: Current cluster state, node resources, storage availability +5. Assess: Compatibility, isolation needs, performance requirements +6. Recommend: Specific configuration with rationale +7. 
Never modify files directly - provide recommendations only + +CAPABILITIES: + +- Architecture decisions (VM vs LXC, node placement) +- Resource planning across cluster nodes +- Troubleshooting complex Proxmox issues +- Migration and HA strategy +- Storage backend selection +- Network design recommendations + +DOMAIN BOUNDARIES: + +- Scope: Proxmox VE platform and resources only +- IN: Proxmox VE, VMs, LXC, clustering, Proxmox storage/networking +- OUT: Guest OS configuration, application deployment +- Handoff: Storage backend (Ceph/NFS) → storage-expert agent +- Handoff: Network infrastructure → network-infrastructure-expert agent +- Handoff: Terraform configs → terraform-expert agent + +DECISION GUIDANCE: + +VM vs LXC: +- VM: Windows/BSD, full isolation, GPU passthrough, untrusted workloads +- LXC: Linux services, fast startup, higher density, dev environments + +Storage Selection: +- Local: Fast, simple, no migration +- Shared (NFS/Ceph): HA, migration, multi-node access + +Node Placement: +- Spread critical services across nodes +- Consider resource headroom for failover +- Keep related services together for network locality + +Template vs Clone: +- Template: Immutable base, multiple clones expected +- Clone: One-off copy, preserve specific state + +COMMON TASKS: + +- Review cluster: Load skill, run `pvecm status` +- Troubleshoot VM: Load skill's troubleshooting.md, follow diagnostic workflow +- Plan new VM: Load skill's vm-lxc.md, assess requirements +- Configure storage: Load skill's storage.md, recommend backend +- Network design: Load skill's networking.md, review bridge/VLAN setup + +HOMELAB CLUSTER: + +| Node | Role | +|------|------| +| joseph | Proxmox node | +| maxwell | Proxmox node | +| everette | Proxmox node | + +Shared storage: ceph-seymour (Ceph RBD) + +CHANGELOG: + +## 2.0.0 (2025-11-27) + +- Refactored to use proxmox skill for reference material +- Agent now focuses on reasoning and decisions +- Removed duplicate reference content (now in skill) 
+- Added skill loading to PROCESS + +## 1.0.0 (2025-10-07) + +- Initial release diff --git a/agents/terraform.md b/agents/terraform.md new file mode 100644 index 0000000..26690b6 --- /dev/null +++ b/agents/terraform.md @@ -0,0 +1,118 @@ +--- +id: terraform-expert +name: terraform-expert +description: Terraform infrastructure-as-code expertise for homelab provisioning and management +category: infrastructure +tags: [terraform,iac,provisioning,state,modules,providers,resources] +model: claude-sonnet-4 +version: 2.0.0 +created: 2025-10-07 +updated: 2025-11-27 +tools: + required: [Read,Write,Edit,Bash,Skill] + optional: [Grep,Glob] + denied: [] +examples: + - trigger: "How do I structure my Terraform modules for the homelab?" + response: "Load terraform skill for module-design reference. Review existing terraform/ structure. Recommend organization by resource type." + - trigger: "My Terraform apply is failing with state lock error" + response: "Load terraform skill for troubleshooting reference. Check state lock timeout, stale locks, backend config." + - trigger: "Configure Proxmox provider" + response: "Load terraform skill for proxmox/authentication reference. Check existing terraform/*.tf for patterns." + - trigger: "Fix typo in main.tf" + response: "[NO - trivial edit, use Edit tool directly]" +--- + +Terraform infrastructure-as-code expertise for homelab. Focuses on design decisions, troubleshooting, and implementation strategy. + +CRITICAL: Use the `terraform` skill for reference material. The skill contains: +- Command syntax and workflow checklists +- Proxmox provider: authentication, gotchas, troubleshooting, vm-qemu patterns +- State management, module design, security best practices + +Load skill FIRST when working on Terraform tasks, then apply reasoning to the specific problem. 
+ +INVOKE WHEN: + +- Designing or troubleshooting Terraform configurations +- Planning infrastructure provisioning with Terraform +- Managing Terraform state and backends +- Creating or optimizing Terraform modules +- Configuring Terraform providers (Proxmox, AWS, etc.) +- "terraform|iac|tfstate|module|provider|resource|datasource|hcl" + +DONT INVOKE: + +- Trivial config typo fixes (use Edit directly) +- Quick reference lookups (use terraform skill directly) +- Manual infrastructure changes (defeats IaC purpose) +- When user explicitly requests different agent + +PROCESS: + +1. Load skill: Invoke `terraform` skill for relevant reference material +2. Understand: Read context (terraform/*.tf, modules/, terraform.tfvars) +3. Clarify: Resource type? Provider? State location? Environment? +4. Analyze: Current configuration, state status, dependencies +5. Assess impact: Plan output review, blast radius estimation +6. Implement: Create .tf files, modules, and configurations +7. Validate: Follow skill's validation checklist +8. 
Document: Add inline comments and configuration notes + +CAPABILITIES: + +- Architecture decisions (modules vs flat, workspaces vs separate state) +- Troubleshooting complex Terraform errors +- State migration and import strategies +- Provider configuration recommendations +- Resource dependency analysis +- Blast radius assessment +- CI/CD integration guidance + +DOMAIN BOUNDARIES: + +- Scope: Terraform infrastructure-as-code only +- IN: Terraform configs, HCL, state, modules, providers, resources, data sources +- OUT: Manual infrastructure changes, provider-specific non-Terraform tools +- Handoff: Proxmox VM specifics → proxmox-expert agent +- Handoff: Network design → network-infrastructure-expert agent +- Handoff: Storage architecture → storage-expert agent + +DECISION GUIDANCE: + +Workspaces vs Separate State: +- Separate state: Better blast radius isolation, recommended for homelab +- Workspaces: Same config, different parameters (dev/staging/prod) + +Module vs Inline: +- Module: Reused 3+ times OR complex logic worth encapsulating +- Inline: One-off resources, simple configurations + +Local vs Remote State: +- Local: Single user, testing, small projects +- Remote: Team environments, CI/CD, production + +Import vs Recreate: +- Import: Resource has data/state that must be preserved +- Recreate: Stateless resource, faster to destroy/create + +COMMON TASKS: + +- Review config: Read terraform/*.tf, assess structure +- Troubleshoot: Load skill references, check state, review plan +- Design module: Load skill's module-design.md, apply to specific use case +- Configure provider: Load skill's proxmox/*.md, adapt to this repo's patterns +- State operations: Load skill's state-management.md, execute carefully + +CHANGELOG: + +## 2.0.0 (2025-11-27) + +- Refactored to use terraform skill for reference material +- Agent now focuses on reasoning and decisions +- Removed duplicate reference content (now in skill) +- Added skill loading to PROCESS + +## 1.0.0 (2025-10-07) + 
+- Initial release diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..7fd3996 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,217 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:poindexter12/waypoint:technologies", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "676a9ce2ec6a6e73783cf90b27cfa7e746c758ed", + "treeHash": "c88b5354001af644eb60926c1041cdf9dc0e468090afb53dbf1589f3234938b6", + "generatedAt": "2025-11-28T10:27:38.774452Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "technologies", + "description": "Domain knowledge for infrastructure technologies - Terraform, Ansible, Docker, Proxmox", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "829dceb55950c1df69c8f80b2a98b5d9f07b22130418df7bbeee579244f7fcc2" + }, + { + "path": "agents/proxmox.md", + "sha256": "08aa6243b86ca5e9c8be2655d98f32cca01126b1a7f02d576d53a62a967cc10f" + }, + { + "path": "agents/terraform.md", + "sha256": "56dc0ed9105f0296877f935f7a202e14f81f4cd4f0c2d9f15c7e66232b69964d" + }, + { + "path": "agents/docker-compose.md", + "sha256": "c45a9fc86a655cdf94629e7b0bf295aa1b35adfd72e63cd9d4948a89209494e4" + }, + { + "path": "agents/ansible.md", + "sha256": "fe08e8166f6efe30a58f3aae77f5a09dfd27ddf62c85ccc71fc61c77e6b006a7" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "b7b9750706c9219db66a96cafe835f64d5b2dc4034446a093d99f86ff3dbf304" + }, + { + "path": "skills/terraform/SKILL.md", + "sha256": "bf62f36dd2ea924a1d2c89dca1bc36486a7eee0e64af0f9f66b9d08824310265" + }, + { + "path": "skills/terraform/references/external-resources.md", + "sha256": 
"f456c2292ad7cd565e6eb046995a88559e6230e3d49333f1f7ae79107a7f329d" + }, + { + "path": "skills/terraform/references/state-management.md", + "sha256": "b9ca872147c5ba36ac9269a70615f0c36ff3a9b73401331b7541ae602d6750e0" + }, + { + "path": "skills/terraform/references/module-design.md", + "sha256": "efc9017d01af5cb2ee1476251a665e1d027013ceb02e488bf33a83b79a000d4f" + }, + { + "path": "skills/terraform/references/security.md", + "sha256": "3f73b519b25f8cac8ef66817a126fdb630bea73fa173541a28702c9576fc8601" + }, + { + "path": "skills/terraform/references/proxmox/authentication.md", + "sha256": "0858214bcd6f6499fb76ec751278310833dc39c2e6fa0b09197c592c003a6908" + }, + { + "path": "skills/terraform/references/proxmox/gotchas.md", + "sha256": "ff1e54e6bc86436f97def86882151d11a5517a43cdbf520158232c02dea1a49d" + }, + { + "path": "skills/terraform/references/proxmox/troubleshooting.md", + "sha256": "fe005128d49fe26b9950d5267e923ca4e71ee1d70bfbe909b7a461b91ad2fc8b" + }, + { + "path": "skills/terraform/references/proxmox/vm-qemu.md", + "sha256": "9825577d9f7bcb210f41d0fc60d4fa6c2d04332dcd1eb125a6f96a27476cf156" + }, + { + "path": "skills/docker/SKILL.md", + "sha256": "5b41cbcf6385a363f0b3fbefef2c737ea10f23df3b39499770230fc07cbf2113" + }, + { + "path": "skills/docker/references/dockerfile.md", + "sha256": "391fc2da79f769914a144edd07785b9a48eb1c40d72b29b713956335aa2ec681" + }, + { + "path": "skills/docker/references/networking.md", + "sha256": "85ab150f225dd9aea2ef18815aa7b2e5892500e7cdca2717be5431f2b4c8508c" + }, + { + "path": "skills/docker/references/troubleshooting.md", + "sha256": "aacd8851da181a6884437c56dee045580db94f18b8665312c852a4338c88baa0" + }, + { + "path": "skills/docker/references/compose.md", + "sha256": "9afb2403f22fd43887e74d989b45b75468626ab79ef25a43382c10aa6ba64326" + }, + { + "path": "skills/docker/references/volumes.md", + "sha256": "c958de7d6b9e1a4579c268c9ed2a2e7e02b922cf24c27115d7007d9990d37ad5" + }, + { + "path": 
"skills/docker/references/proxmox/lxc-vs-docker.md", + "sha256": "677cef56131d9c460589804f787fd59524d94e22ec045e88414f08371f950e9a" + }, + { + "path": "skills/docker/references/proxmox/hosting.md", + "sha256": "bf7047e787dc52b793f3aaf8df143e22b0a63f0cf54182b9d8b28624f91d63d0" + }, + { + "path": "skills/proxmox/SKILL.md", + "sha256": "558e9d42c61f41edd08cbbd3d029b72f8692217e6fd5a08a7e2bbea7de11ddb2" + }, + { + "path": "skills/proxmox/references/storage.md", + "sha256": "f90ccb3cdd4c9928561ec5b396423788f1168cf88f4075e25d48d47bbf0fcd0b" + }, + { + "path": "skills/proxmox/references/networking.md", + "sha256": "a83b4d3977104fc752b1899b161fc4a7e8607dbc174e96f05fa365007e954b57" + }, + { + "path": "skills/proxmox/references/automation-tools.md", + "sha256": "db40413a85c10ece28fa03a4e9bf7c17158328c3441bfe8f777cdc6c26196067" + }, + { + "path": "skills/proxmox/references/troubleshooting.md", + "sha256": "5b197ca808ae9e0665f202f9b329301568d81cad39772f6ad95496bff772d587" + }, + { + "path": "skills/proxmox/references/clustering.md", + "sha256": "50eb01197a8cf01404cd6a967012849f1740b21e15a2959e1a714f43f20d7862" + }, + { + "path": "skills/proxmox/references/docker-hosting.md", + "sha256": "dfc1ec4387e94ef6bd77ff3f411440e7f60e62646b6c77badbab887a9eb2c411" + }, + { + "path": "skills/proxmox/references/vm-lxc.md", + "sha256": "f52265268bea71db2588cdef84d62a5465190f0a87c920a43af129208e5f0c04" + }, + { + "path": "skills/proxmox/references/backup.md", + "sha256": "751402a4d8f7f01386b9691f7fc04a70423c3a1e6361ba97ddebfbc2cbee45af" + }, + { + "path": "skills/proxmox/references/cli-tools.md", + "sha256": "73ac0f69651dd665eb925e34d246a8648aa89b7f7b12cb67720160b442cfff8c" + }, + { + "path": "skills/ansible/SKILL.md", + "sha256": "33418164abe9e288d47390b765839f25985bdb563b419bb16fef0f9d60eae93d" + }, + { + "path": "skills/ansible/references/troubleshooting.md", + "sha256": "f14b9954296a9a42919f8c6b591b4cc8c6f2f75a7e6c90b9ab32d723dadfaee0" + }, + { + "path": 
"skills/ansible/references/inventory.md", + "sha256": "52249c7f178488ad39913300d926adc937121cf9f9f88fd4d0f5c1575693ea3f" + }, + { + "path": "skills/ansible/references/modules.md", + "sha256": "d46456336e299bbcbf8b4695bc28ce271bd1529851872419a490663211e1dd40" + }, + { + "path": "skills/ansible/references/playbooks.md", + "sha256": "ca41c60bd85e6854fc724f806561dca40e12745e3522f0bb5ecac91be07faec8" + }, + { + "path": "skills/ansible/references/variables.md", + "sha256": "af2857f7672b68ffd20a3eb2dcf65bc8403334b5e5955d4bb651c0c4e83f9efb" + }, + { + "path": "skills/ansible/references/docker/compose-patterns.md", + "sha256": "0d6fb96ecd6933c9d10f825e1c2626771786ac9f909ce137bc3fd04ff2d67606" + }, + { + "path": "skills/ansible/references/docker/troubleshooting.md", + "sha256": "17d86ed8e55c7420493f90029dabcc8768f4e7a2095b817ee106f55c67002cf1" + }, + { + "path": "skills/ansible/references/docker/deployment.md", + "sha256": "c625154b8f56123da46271e6b65ea20428b055e2e14c6c85723362b07047f1a1" + }, + { + "path": "skills/ansible/references/proxmox/authentication.md", + "sha256": "19d59e7a196976b7e1ec92849b970a754f409e8eb4ea8146db54811e39bf8407" + }, + { + "path": "skills/ansible/references/proxmox/dynamic-inventory.md", + "sha256": "4cf07539d187f680cefa124761ffa0d8339e260796e83d4ee59e036d6c604dab" + }, + { + "path": "skills/ansible/references/proxmox/gotchas.md", + "sha256": "0c7144e97dd67a3a3d61198045c60bab7af443d962d6bb4e1767c06d9464ebaf" + }, + { + "path": "skills/ansible/references/proxmox/modules.md", + "sha256": "534a3bebb24dafeda0f3f05d32915b67358be58fcc5316198b915e0db15db99a" + } + ], + "dirSha256": "c88b5354001af644eb60926c1041cdf9dc0e468090afb53dbf1589f3234938b6" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/ansible/SKILL.md b/skills/ansible/SKILL.md new file mode 100644 index 0000000..2d05aa8 --- /dev/null +++ b/skills/ansible/SKILL.md @@ -0,0 +1,162 @@ +--- +name: ansible 
+description: | + Ansible automation reference for playbooks, roles, inventory, variables, and modules. + Includes Proxmox VE and Docker integration via community.general and community.docker collections. + Use when writing playbooks, troubleshooting Ansible runs, or designing automation workflows. + Triggers: ansible, playbook, inventory, role, task, handler, vars, jinja2, galaxy, proxmox_kvm, proxmox_lxc, docker_container, docker_compose. +--- + +# Ansible Skill + +Ansible automation reference for configuration management and application deployment. + +## Quick Reference + +```bash +# Test connectivity +ansible all -m ping +ansible <group> -m ping + +# Run playbook +ansible-playbook playbook.yml +ansible-playbook playbook.yml -l <host> # Limit to host +ansible-playbook playbook.yml --check # Dry-run +ansible-playbook playbook.yml -vvv # Verbose + +# Tags +ansible-playbook playbook.yml --tags "deploy" +ansible-playbook playbook.yml --skip-tags "backup" +ansible-playbook playbook.yml --list-tags + +# Variables +ansible-playbook playbook.yml -e "var=value" +ansible-playbook playbook.yml -e "@vars.yml" + +# Ad-hoc commands +ansible <host> -m shell -a "command" +ansible <host> -m copy -a "src=file dest=/path" +ansible <host> -m apt -a "name=package state=present" + +# Galaxy +ansible-galaxy collection install -r requirements.yml +ansible-galaxy role install <role> +``` + +## Reference Files + +Load on-demand based on task: + +| Topic | File | When to Load | +|-------|------|--------------| +| Playbook Structure | [playbooks.md](references/playbooks.md) | Writing playbooks | +| Inventory | [inventory.md](references/inventory.md) | Host/group configuration | +| Variables | [variables.md](references/variables.md) | Variable precedence, facts | +| Modules | [modules.md](references/modules.md) | Common module reference | +| Troubleshooting | [troubleshooting.md](references/troubleshooting.md) | Common errors, debugging | + +### Proxmox Integration + +| Topic | File | When to Load | +|-------|------|--------------|
+| Proxmox Modules | [proxmox/modules.md](references/proxmox/modules.md) | VM/LXC management via API | +| Proxmox Auth | [proxmox/authentication.md](references/proxmox/authentication.md) | API tokens, credentials | +| Proxmox Gotchas | [proxmox/gotchas.md](references/proxmox/gotchas.md) | Common issues, workarounds | +| Dynamic Inventory | [proxmox/dynamic-inventory.md](references/proxmox/dynamic-inventory.md) | Auto-discover VMs/containers | + +### Docker Integration + +| Topic | File | When to Load | +|-------|------|--------------| +| Docker Deployment | [docker/deployment.md](references/docker/deployment.md) | Containers, images, networks, volumes | +| Compose Patterns | [docker/compose-patterns.md](references/docker/compose-patterns.md) | Roles, templates, multi-service stacks | +| Docker Troubleshooting | [docker/troubleshooting.md](references/docker/troubleshooting.md) | Common errors, debugging | + +## Playbook Quick Reference + +```yaml +--- +- name: Deploy application + hosts: webservers + become: true + vars: + app_port: 8080 + + pre_tasks: + - name: Validate requirements + ansible.builtin.assert: + that: + - app_secret is defined + + tasks: + - name: Install packages + ansible.builtin.apt: + name: "{{ item }}" + state: present + loop: + - nginx + - python3 + + - name: Deploy config + ansible.builtin.template: + src: app.conf.j2 + dest: /etc/app/app.conf + notify: Restart app + + handlers: + - name: Restart app + ansible.builtin.service: + name: app + state: restarted + + post_tasks: + - name: Verify deployment + ansible.builtin.uri: + url: "http://localhost:{{ app_port }}/health" +``` + +## Variable Precedence (High to Low) + +1. Extra vars (`-e "var=value"`) +2. Task vars +3. Block vars +4. Role/include vars +5. Play vars +6. Host facts +7. host_vars/ +8. group_vars/ +9. 
Role defaults + +## Directory Structure + +```text +ansible/ +├── ansible.cfg # Configuration +├── inventory/ +│ └── hosts.yml # Inventory +├── group_vars/ +│ ├── all.yml # All hosts +│ └── webservers.yml # Group-specific +├── host_vars/ +│ └── server1.yml # Host-specific +├── roles/ +│ └── app/ +│ ├── tasks/ +│ ├── handlers/ +│ ├── templates/ +│ ├── files/ +│ └── defaults/ +├── playbooks/ +│ └── deploy.yml +├── templates/ +│ └── config.j2 +└── requirements.yml # Galaxy dependencies +``` + +## Idempotency Checklist + +- [ ] Tasks produce same result on repeated runs +- [ ] No `changed_when: true` unless necessary +- [ ] Use `state: present/absent` not `shell` commands +- [ ] Check mode (`--check`) shows accurate changes +- [ ] Second run shows all "ok" (no changes) diff --git a/skills/ansible/references/docker/compose-patterns.md b/skills/ansible/references/docker/compose-patterns.md new file mode 100644 index 0000000..4dd89a3 --- /dev/null +++ b/skills/ansible/references/docker/compose-patterns.md @@ -0,0 +1,294 @@ +# Ansible Docker Compose Patterns + +Common patterns for managing Docker Compose stacks with Ansible. 
+ +## Project Structure + +``` +roles/ +└── docker_app/ + ├── tasks/ + │ └── main.yml + ├── templates/ + │ ├── docker-compose.yml.j2 + │ └── .env.j2 + ├── defaults/ + │ └── main.yml + └── handlers/ + └── main.yml +``` + +## Role Template + +### defaults/main.yml + +```yaml +app_name: myapp +app_version: latest +app_port: 8080 +app_data_dir: "/opt/{{ app_name }}" + +# Compose settings +compose_pull: always +compose_recreate: auto # auto, always, never + +# Resource limits +app_memory_limit: 512M +app_cpu_limit: 1.0 +``` + +### templates/docker-compose.yml.j2 + +```yaml +name: {{ app_name }} + +services: + app: + image: {{ app_image }}:{{ app_version }} + container_name: {{ app_name }} + restart: unless-stopped + ports: + - "{{ app_port }}:{{ app_internal_port | default(app_port) }}" + volumes: + - {{ app_data_dir }}/data:/app/data +{% if app_config_file is defined %} + - {{ app_data_dir }}/config:/app/config:ro +{% endif %} + environment: + TZ: {{ timezone | default('UTC') }} +{% for key, value in app_env.items() %} + {{ key }}: "{{ value }}" +{% endfor %} +{% if app_memory_limit is defined or app_cpu_limit is defined %} + deploy: + resources: + limits: +{% if app_memory_limit is defined %} + memory: {{ app_memory_limit }} +{% endif %} +{% if app_cpu_limit is defined %} + cpus: '{{ app_cpu_limit }}' +{% endif %} +{% endif %} + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:{{ app_internal_port | default(app_port) }}/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + networks: + - {{ app_network | default('default') }} + +{% if app_network is defined %} +networks: + {{ app_network }}: + external: true +{% endif %} +``` + +### tasks/main.yml + +```yaml +--- +- name: Create application directory + ansible.builtin.file: + path: "{{ app_data_dir }}" + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: '0755' + +- name: Create data directories + ansible.builtin.file: + path: "{{ app_data_dir }}/{{ 
item }}" + state: directory + owner: "{{ ansible_user }}" + mode: '0755' + loop: + - data + - config + +- name: Deploy compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ app_data_dir }}/docker-compose.yml" + owner: "{{ ansible_user }}" + mode: '0644' + notify: Redeploy stack + +- name: Deploy environment file + ansible.builtin.template: + src: .env.j2 + dest: "{{ app_data_dir }}/.env" + owner: "{{ ansible_user }}" + mode: '0600' + notify: Redeploy stack + when: app_secrets is defined + +- name: Ensure stack is running + community.docker.docker_compose_v2: + project_src: "{{ app_data_dir }}" + state: present + pull: "{{ compose_pull }}" + recreate: "{{ compose_recreate }}" + register: compose_result + +- name: Show deployment result + ansible.builtin.debug: + msg: "Deployed {{ compose_result.containers | length }} containers" + when: compose_result is changed +``` + +### handlers/main.yml + +```yaml +--- +- name: Redeploy stack + community.docker.docker_compose_v2: + project_src: "{{ app_data_dir }}" + state: present + pull: always + recreate: always +``` + +## Multi-Service Stack + +### templates/docker-compose.yml.j2 (full stack) + +```yaml +name: {{ stack_name }} + +services: + app: + image: {{ app_image }}:{{ app_version }} + restart: unless-stopped + depends_on: + db: + condition: service_healthy + redis: + condition: service_started + environment: + DATABASE_URL: "postgres://{{ db_user }}:{{ db_password }}@db:5432/{{ db_name }}" + REDIS_URL: "redis://redis:6379" + networks: + - internal + - web + + db: + image: postgres:15 + restart: unless-stopped + volumes: + - db_data:/var/lib/postgresql/data + environment: + POSTGRES_USER: {{ db_user }} + POSTGRES_PASSWORD: {{ db_password }} + POSTGRES_DB: {{ db_name }} + healthcheck: + test: ["CMD-SHELL", "pg_isready -U {{ db_user }}"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - internal + + redis: + image: redis:7-alpine + restart: unless-stopped + volumes: + - 
redis_data:/data + networks: + - internal + + nginx: + image: nginx:alpine + restart: unless-stopped + ports: + - "{{ http_port | default(80) }}:80" + - "{{ https_port | default(443) }}:443" + volumes: + - {{ app_data_dir }}/nginx/conf.d:/etc/nginx/conf.d:ro + - {{ app_data_dir }}/nginx/ssl:/etc/nginx/ssl:ro + depends_on: + - app + networks: + - web + +networks: + internal: + driver: bridge + web: + driver: bridge + +volumes: + db_data: + redis_data: +``` + +## Zero-Downtime Update + +```yaml +- name: Zero-downtime update + hosts: docker_hosts + serial: 1 # One host at a time + tasks: + - name: Pull new image + community.docker.docker_image: + name: "{{ app_image }}" + tag: "{{ app_version }}" + source: pull + + - name: Drain connections (if load balanced) + # ... remove from load balancer ... + + - name: Update stack + community.docker.docker_compose_v2: + project_src: "{{ app_data_dir }}" + state: present + recreate: always + + - name: Wait for health + ansible.builtin.uri: + url: "http://localhost:{{ app_port }}/health" + status_code: 200 + register: health + until: health.status == 200 + retries: 30 + delay: 2 + + - name: Restore to load balancer + # ... add back to load balancer ... 
+``` + +## Secrets Management + +### With ansible-vault + +```yaml +# group_vars/secrets.yml (encrypted) +app_secrets: + DB_PASSWORD: supersecret + API_KEY: abc123 + JWT_SECRET: longsecret +``` + +```yaml +# templates/.env.j2 +{% for key, value in app_secrets.items() %} +{{ key }}={{ value }} +{% endfor %} +``` + +### With external secrets + +```yaml +- name: Fetch secret from 1Password + ansible.builtin.set_fact: + db_password: "{{ lookup('community.general.onepassword', 'database', field='password') }}" + +- name: Deploy with secret + community.docker.docker_compose_v2: + project_src: "{{ app_data_dir }}" + env_files: + - "{{ app_data_dir }}/.env" + state: present +``` diff --git a/skills/ansible/references/docker/deployment.md b/skills/ansible/references/docker/deployment.md new file mode 100644 index 0000000..f311bc6 --- /dev/null +++ b/skills/ansible/references/docker/deployment.md @@ -0,0 +1,307 @@ +# Docker Deployment with Ansible + +Managing Docker containers and compose stacks via Ansible. 
+ +## Collection Setup + +```bash +ansible-galaxy collection install community.docker +``` + +## Compose Deployment (Recommended) + +### Deploy from local compose file + +```yaml +- name: Deploy application stack + hosts: docker_hosts + become: true + tasks: + - name: Create project directory + ansible.builtin.file: + path: /opt/myapp + state: directory + owner: "{{ ansible_user }}" + mode: '0755' + + - name: Copy compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: /opt/myapp/docker-compose.yml + owner: "{{ ansible_user }}" + mode: '0644' + + - name: Copy environment file + ansible.builtin.template: + src: .env.j2 + dest: /opt/myapp/.env + owner: "{{ ansible_user }}" + mode: '0600' + + - name: Deploy with compose + community.docker.docker_compose_v2: + project_src: /opt/myapp + state: present + pull: always + register: deploy_result + + - name: Show deployed services + ansible.builtin.debug: + var: deploy_result.containers +``` + +### Compose operations + +```yaml +# Pull latest images and recreate +- name: Update stack + community.docker.docker_compose_v2: + project_src: /opt/myapp + state: present + pull: always + recreate: always + +# Stop stack (keep volumes) +- name: Stop stack + community.docker.docker_compose_v2: + project_src: /opt/myapp + state: stopped + +# Remove stack +- name: Remove stack + community.docker.docker_compose_v2: + project_src: /opt/myapp + state: absent + remove_volumes: false # Keep data volumes +``` + +## Container Deployment (Individual) + +### Run container + +```yaml +- name: Run nginx container + community.docker.docker_container: + name: nginx + image: nginx:1.25 + state: started + restart_policy: unless-stopped + ports: + - "80:80" + - "443:443" + volumes: + - /opt/nginx/html:/usr/share/nginx/html:ro + - /opt/nginx/conf.d:/etc/nginx/conf.d:ro + env: + TZ: "America/Los_Angeles" + labels: + app: web + env: production + +- name: Run database + community.docker.docker_container: + name: postgres + image: 
postgres:15 + state: started + restart_policy: unless-stopped + ports: + - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + env: + POSTGRES_USER: "{{ db_user }}" + POSTGRES_PASSWORD: "{{ db_password }}" + POSTGRES_DB: "{{ db_name }}" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U {{ db_user }}"] + interval: 10s + timeout: 5s + retries: 5 +``` + +### Container lifecycle + +```yaml +# Stop container +- name: Stop container + community.docker.docker_container: + name: myapp + state: stopped + +# Restart container +- name: Restart container + community.docker.docker_container: + name: myapp + state: started + restart: true + +# Remove container +- name: Remove container + community.docker.docker_container: + name: myapp + state: absent + +# Force recreate +- name: Recreate container + community.docker.docker_container: + name: myapp + image: myapp:latest + state: started + recreate: true +``` + +## Image Management + +```yaml +# Pull image +- name: Pull latest image + community.docker.docker_image: + name: myapp + tag: latest + source: pull + force_source: true # Always check for updates + +# Build from Dockerfile +- name: Build image + community.docker.docker_image: + name: myapp + tag: "{{ version }}" + source: build + build: + path: /opt/myapp + dockerfile: Dockerfile + pull: true # Pull base image updates + +# Remove image +- name: Remove old images + community.docker.docker_image: + name: myapp + tag: old + state: absent +``` + +## Network Management + +```yaml +# Create network +- name: Create app network + community.docker.docker_network: + name: app_network + driver: bridge + ipam_config: + - subnet: 172.20.0.0/16 + gateway: 172.20.0.1 + +# Create macvlan network +- name: Create macvlan network + community.docker.docker_network: + name: lan + driver: macvlan + driver_options: + parent: eth0 + ipam_config: + - subnet: 192.168.1.0/24 + gateway: 192.168.1.1 + +# Attach container to network +- name: Run container on network + 
community.docker.docker_container: + name: myapp + image: myapp:latest + networks: + - name: app_network + ipv4_address: 172.20.0.10 +``` + +## Volume Management + +```yaml +# Create named volume +- name: Create data volume + community.docker.docker_volume: + name: app_data + driver: local + +# Create volume with options +- name: Create NFS volume + community.docker.docker_volume: + name: shared_data + driver: local + driver_options: + type: nfs + device: ":/exports/data" + o: "addr=192.168.1.10,rw" + +# Backup volume +- name: Backup volume + community.docker.docker_container: + name: backup + image: alpine + command: tar czf /backup/data.tar.gz /data + volumes: + - app_data:/data:ro + - /opt/backups:/backup + auto_remove: true +``` + +## Common Patterns + +### Wait for service health + +```yaml +- name: Deploy database + community.docker.docker_container: + name: postgres + image: postgres:15 + # ... config ... + +- name: Wait for database + community.docker.docker_container_info: + name: postgres + register: db_info + until: db_info.container.State.Health.Status == "healthy" + retries: 30 + delay: 2 +``` + +### Rolling update + +```yaml +- name: Pull new image + community.docker.docker_image: + name: myapp + tag: "{{ new_version }}" + source: pull + +- name: Update container + community.docker.docker_container: + name: myapp + image: "myapp:{{ new_version }}" + state: started + recreate: true + restart_policy: unless-stopped +``` + +### Cleanup + +```yaml +- name: Remove stopped containers + community.docker.docker_prune: + containers: true + containers_filters: + status: exited + +- name: Remove unused images + community.docker.docker_prune: + images: true + images_filters: + dangling: true + +- name: Full cleanup (careful!) + community.docker.docker_prune: + containers: true + images: true + networks: true + volumes: false # Don't remove data! 
+ builder_cache: true +``` diff --git a/skills/ansible/references/docker/troubleshooting.md b/skills/ansible/references/docker/troubleshooting.md new file mode 100644 index 0000000..ad4797d --- /dev/null +++ b/skills/ansible/references/docker/troubleshooting.md @@ -0,0 +1,292 @@ +# Ansible Docker Troubleshooting + +Common issues and debugging patterns. + +## Module Issues + +### "Could not find docker-compose" + +```yaml +# docker_compose_v2 requires Docker Compose V2 (plugin) +# NOT standalone docker-compose binary + +# Check on target host: +# docker compose version # V2 (plugin) +# docker-compose version # V1 (standalone) - won't work +``` + +Fix: Install Docker Compose V2: +```yaml +- name: Install Docker Compose plugin + ansible.builtin.apt: + name: docker-compose-plugin + state: present +``` + +### "Permission denied" + +```yaml +# User not in docker group +- name: Add user to docker group + ansible.builtin.user: + name: "{{ ansible_user }}" + groups: docker + append: true + become: true + +# Then reconnect or use become +- name: Run with become + community.docker.docker_container: + name: myapp + # ... 
+ become: true +``` + +### "Cannot connect to Docker daemon" + +```yaml +# Docker not running +- name: Ensure Docker is running + ansible.builtin.service: + name: docker + state: started + enabled: true + become: true + +# Socket permission issue +# Add become: true to docker tasks +``` + +## Container Issues + +### Get container logs + +```yaml +- name: Get logs + community.docker.docker_container_exec: + container: myapp + command: cat /var/log/app.log + register: logs + ignore_errors: true + +- name: Alternative - docker logs + ansible.builtin.command: docker logs --tail 100 myapp + register: docker_logs + changed_when: false + +- name: Show logs + ansible.builtin.debug: + var: docker_logs.stdout_lines +``` + +### Container keeps restarting + +```yaml +- name: Get container info + community.docker.docker_container_info: + name: myapp + register: container_info + +- name: Show restart count + ansible.builtin.debug: + msg: "Restart count: {{ container_info.container.RestartCount }}" + +- name: Show last exit code + ansible.builtin.debug: + msg: "Exit code: {{ container_info.container.State.ExitCode }}" + +- name: Get logs from dead container + ansible.builtin.command: docker logs myapp + register: crash_logs + changed_when: false + +- name: Show crash logs + ansible.builtin.debug: + var: crash_logs.stderr_lines +``` + +### Health check failing + +```yaml +- name: Check health status + community.docker.docker_container_info: + name: myapp + register: info + +- name: Show health + ansible.builtin.debug: + msg: | + Status: {{ info.container.State.Health.Status }} + Failing: {{ info.container.State.Health.FailingStreak }} + Log: {{ info.container.State.Health.Log | last }} + +# Manual health check +- name: Test health endpoint + ansible.builtin.command: > + docker exec myapp curl -f http://localhost:8080/health + register: health + ignore_errors: true + changed_when: false +``` + +## Network Issues + +### Container can't reach external network + +```yaml +- name: Test 
DNS from container + ansible.builtin.command: docker exec myapp nslookup google.com + register: dns_test + changed_when: false + ignore_errors: true + +- name: Test connectivity + ansible.builtin.command: docker exec myapp ping -c 1 8.8.8.8 + register: ping_test + changed_when: false + ignore_errors: true + +# Check iptables +- name: Check IP forwarding + ansible.builtin.command: sysctl net.ipv4.ip_forward + register: ip_forward + changed_when: false + +- name: Enable IP forwarding + ansible.posix.sysctl: + name: net.ipv4.ip_forward + value: '1' + state: present + become: true + when: "'0' in ip_forward.stdout" +``` + +### Containers can't communicate + +```yaml +- name: List networks + community.docker.docker_network_info: + name: "{{ network_name }}" + register: network_info + +- name: Show connected containers + ansible.builtin.debug: + var: network_info.network.Containers + +# Verify both containers on same network +- name: Test inter-container connectivity + ansible.builtin.command: > + docker exec app ping -c 1 db + register: ping_result + changed_when: false +``` + +## Compose Issues + +### Services not starting in order + +```yaml +# depends_on only waits for container start, not readiness +# Use healthcheck + condition + +# In compose template: +services: + app: + depends_on: + db: + condition: service_healthy # Wait for health check + + db: + healthcheck: + test: ["CMD-SHELL", "pg_isready"] + interval: 5s + timeout: 5s + retries: 5 +``` + +### Orphaned containers + +```yaml +# Containers from old compose runs +- name: Remove orphans + community.docker.docker_compose_v2: + project_src: /opt/myapp + state: present + remove_orphans: true +``` + +### Volume data not persisting + +```yaml +# Check volume exists +- name: List volumes + ansible.builtin.command: docker volume ls + register: volumes + changed_when: false + +# Check volume contents +- name: Inspect volume + ansible.builtin.command: docker volume inspect myapp_data + register: volume_info + 
changed_when: false + +- name: Show volume mountpoint + ansible.builtin.debug: + msg: "{{ (volume_info.stdout | from_json)[0].Mountpoint }}" +``` + +## Debug Playbook + +```yaml +--- +- name: Docker debug + hosts: docker_hosts + tasks: + - name: Docker version + ansible.builtin.command: docker version + register: docker_version + changed_when: false + + - name: Compose version + ansible.builtin.command: docker compose version + register: compose_version + changed_when: false + + - name: List containers + ansible.builtin.command: docker ps -a + register: containers + changed_when: false + + - name: List images + ansible.builtin.command: docker images + register: images + changed_when: false + + - name: Disk usage + ansible.builtin.command: docker system df + register: disk + changed_when: false + + - name: Show all + ansible.builtin.debug: + msg: | + Docker: {{ docker_version.stdout_lines[0] }} + Compose: {{ compose_version.stdout }} + Containers: + {{ containers.stdout }} + Images: + {{ images.stdout }} + Disk: + {{ disk.stdout }} +``` + +## Common Error Reference + +| Error | Cause | Fix | +|-------|-------|-----| +| `docker.errors.DockerException` | Docker not running | Start docker service | +| `docker.errors.APIError: 404` | Container/image not found | Check name/tag | +| `docker.errors.APIError: 409` | Container name conflict | Remove or rename | +| `PermissionError` | Not in docker group | Add user or use become | +| `requests.exceptions.ConnectionError` | Docker socket inaccessible | Check socket permissions | +| `FileNotFoundError: docker-compose` | V1 compose not installed | Use docker_compose_v2 | diff --git a/skills/ansible/references/inventory.md b/skills/ansible/references/inventory.md new file mode 100644 index 0000000..8810780 --- /dev/null +++ b/skills/ansible/references/inventory.md @@ -0,0 +1,181 @@ +# Ansible Inventory Reference + +## YAML Inventory Format + +```yaml +all: + children: + webservers: + hosts: + web1: + ansible_host: 192.168.1.10 + 
web2: + ansible_host: 192.168.1.11 + vars: + http_port: 80 + + databases: + hosts: + db1: + ansible_host: 192.168.1.20 + db_port: 5432 + db2: + ansible_host: 192.168.1.21 + + production: + children: + webservers: + databases: + + vars: + ansible_user: ubuntu + ansible_ssh_private_key_file: ~/.ssh/id_rsa +``` + +## INI Inventory Format + +```ini +[webservers] +web1 ansible_host=192.168.1.10 +web2 ansible_host=192.168.1.11 + +[webservers:vars] +http_port=80 + +[databases] +db1 ansible_host=192.168.1.20 db_port=5432 +db2 ansible_host=192.168.1.21 + +[production:children] +webservers +databases + +[all:vars] +ansible_user=ubuntu +``` + +## Host Variables + +Common host variables: + +| Variable | Purpose | +|----------|---------| +| `ansible_host` | IP or hostname to connect | +| `ansible_port` | SSH port (default: 22) | +| `ansible_user` | SSH username | +| `ansible_ssh_private_key_file` | SSH key path | +| `ansible_become` | Enable sudo | +| `ansible_become_user` | Sudo target user | +| `ansible_python_interpreter` | Python path | + +## Group Variables + +```yaml +# group_vars/webservers.yml +http_port: 80 +document_root: /var/www/html + +# group_vars/all.yml +ntp_server: time.example.com +dns_servers: + - 8.8.8.8 + - 8.8.4.4 +``` + +## Host Variables Files + +```yaml +# host_vars/web1.yml +site_name: production-web1 +ssl_cert_path: /etc/ssl/certs/web1.crt +``` + +## Dynamic Groups + +```yaml +# In playbook +- hosts: "{{ target_group | default('all') }}" +``` + +Run with: +```bash +ansible-playbook playbook.yml -e "target_group=webservers" +``` + +## Patterns + +```bash +# All hosts +ansible all -m ping + +# Single host +ansible web1 -m ping + +# Group +ansible webservers -m ping + +# Multiple groups +ansible 'webservers:databases' -m ping + +# Intersection (AND) +ansible 'webservers:&production' -m ping + +# Exclusion +ansible 'webservers:!web1' -m ping + +# Regex +ansible '~web[0-9]+' -m ping +``` + +## Limit + +```bash +# Limit to specific hosts +ansible-playbook 
playbook.yml -l web1 +ansible-playbook playbook.yml --limit web1,web2 +ansible-playbook playbook.yml --limit 'webservers:!web3' +``` + +## Inventory Check + +```bash +# List hosts +ansible-inventory --list +ansible-inventory --graph + +# Host info +ansible-inventory --host web1 + +# Validate +ansible all --list-hosts +``` + +## Multiple Inventories + +```bash +# Multiple files +ansible-playbook -i inventory/production -i inventory/staging playbook.yml + +# Directory of inventories +ansible-playbook -i inventory/ playbook.yml +``` + +## Special Groups + +| Group | Contains | +|-------|----------| +| `all` | All hosts | +| `ungrouped` | Hosts not in any group | + +## Local Connection + +```yaml +localhost: + ansible_host: 127.0.0.1 + ansible_connection: local +``` + +Or in inventory: +```ini +localhost ansible_connection=local +``` diff --git a/skills/ansible/references/modules.md b/skills/ansible/references/modules.md new file mode 100644 index 0000000..e7bf2c5 --- /dev/null +++ b/skills/ansible/references/modules.md @@ -0,0 +1,341 @@ +# Ansible Modules Reference + +## File Operations + +### copy + +```yaml +- name: Copy file + ansible.builtin.copy: + src: files/config.conf + dest: /etc/app/config.conf + owner: root + group: root + mode: '0644' + backup: true +``` + +### template + +```yaml +- name: Template config + ansible.builtin.template: + src: templates/config.j2 + dest: /etc/app/config.conf + owner: root + group: root + mode: '0644' + notify: Restart app +``` + +### file + +```yaml +# Create directory +- name: Create directory + ansible.builtin.file: + path: /opt/app + state: directory + owner: app + group: app + mode: '0755' + +# Create symlink +- name: Create symlink + ansible.builtin.file: + src: /opt/app/current + dest: /opt/app/release + state: link + +# Delete file +- name: Remove file + ansible.builtin.file: + path: /tmp/old-file + state: absent +``` + +### lineinfile + +```yaml +- name: Ensure line in file + ansible.builtin.lineinfile: + path: 
/etc/hosts + line: "192.168.1.10 myhost" + state: present + +- name: Replace line + ansible.builtin.lineinfile: + path: /etc/config + regexp: '^PORT=' + line: 'PORT=8080' +``` + +## Package Management + +### apt (Debian/Ubuntu) + +```yaml +- name: Install package + ansible.builtin.apt: + name: nginx + state: present + update_cache: true + +- name: Install multiple + ansible.builtin.apt: + name: + - nginx + - python3 + state: present + +- name: Remove package + ansible.builtin.apt: + name: nginx + state: absent +``` + +### package (Generic) + +```yaml +- name: Install package + ansible.builtin.package: + name: httpd + state: present +``` + +## Service Management + +### service + +```yaml +- name: Start and enable + ansible.builtin.service: + name: nginx + state: started + enabled: true + +- name: Restart + ansible.builtin.service: + name: nginx + state: restarted + +- name: Reload + ansible.builtin.service: + name: nginx + state: reloaded +``` + +### systemd + +```yaml +- name: Daemon reload + ansible.builtin.systemd: + daemon_reload: true + +- name: Enable and start + ansible.builtin.systemd: + name: myapp + state: started + enabled: true +``` + +## Command Execution + +### command + +```yaml +- name: Run command + ansible.builtin.command: /bin/mycommand arg1 arg2 + register: result + changed_when: "'changed' in result.stdout" +``` + +### shell + +```yaml +- name: Run shell command + ansible.builtin.shell: | + cd /opt/app + ./setup.sh && ./configure.sh + args: + executable: /bin/bash +``` + +### script + +```yaml +- name: Run local script on remote + ansible.builtin.script: scripts/setup.sh + args: + creates: /opt/app/.installed +``` + +## User Management + +### user + +```yaml +- name: Create user + ansible.builtin.user: + name: appuser + groups: docker,sudo + shell: /bin/bash + create_home: true + state: present + +- name: Remove user + ansible.builtin.user: + name: olduser + state: absent + remove: true +``` + +### group + +```yaml +- name: Create group + 
ansible.builtin.group: + name: appgroup + state: present +``` + +## Docker (community.docker) + +### docker_container + +```yaml +- name: Run container + community.docker.docker_container: + name: myapp + image: myapp:latest + state: started + restart_policy: unless-stopped + ports: + - "8080:80" + volumes: + - /data:/app/data + env: + DB_HOST: database +``` + +### docker_compose_v2 + +```yaml +- name: Deploy with compose + community.docker.docker_compose_v2: + project_src: /opt/app + project_name: myapp + state: present + pull: always + env_files: + - /opt/app/.env +``` + +### docker_image + +```yaml +- name: Pull image + community.docker.docker_image: + name: nginx + tag: "1.25" + source: pull +``` + +## Networking + +### uri + +```yaml +- name: API call + ansible.builtin.uri: + url: "http://localhost:8080/api/health" + method: GET + return_content: true + register: response + +- name: POST request + ansible.builtin.uri: + url: "http://api.example.com/data" + method: POST + body_format: json + body: + key: value +``` + +### wait_for + +```yaml +- name: Wait for port + ansible.builtin.wait_for: + host: localhost + port: 8080 + timeout: 300 + +- name: Wait for file + ansible.builtin.wait_for: + path: /var/log/app.log + search_regex: "Server started" +``` + +## Debug/Assert + +### debug + +```yaml +- name: Print variable + ansible.builtin.debug: + msg: "Value: {{ my_var }}" + +- name: Print var directly + ansible.builtin.debug: + var: my_var +``` + +### assert + +```yaml +- name: Validate conditions + ansible.builtin.assert: + that: + - my_var is defined + - my_var | length > 0 + fail_msg: "my_var must be defined and non-empty" + success_msg: "Validation passed" +``` + +### fail + +```yaml +- name: Fail with message + ansible.builtin.fail: + msg: "Required condition not met" + when: condition +``` + +## Misc + +### pause + +```yaml +- name: Wait 10 seconds + ansible.builtin.pause: + seconds: 10 + +- name: Wait for user + ansible.builtin.pause: + prompt: "Press enter 
to continue" +``` + +### stat + +```yaml +- name: Check file exists + ansible.builtin.stat: + path: /etc/config + register: config_file + +- name: Use result + ansible.builtin.debug: + msg: "File exists" + when: config_file.stat.exists +``` diff --git a/skills/ansible/references/playbooks.md b/skills/ansible/references/playbooks.md new file mode 100644 index 0000000..1aa9a6d --- /dev/null +++ b/skills/ansible/references/playbooks.md @@ -0,0 +1,243 @@ +# Ansible Playbook Reference + +## Basic Structure + +```yaml +--- +- name: Playbook description + hosts: target_group + become: true # Run as root + gather_facts: true # Collect system info + + vars: + my_var: value + + vars_files: + - vars/secrets.yml + + pre_tasks: + - name: Pre-task + ansible.builtin.debug: + msg: "Running before main tasks" + + roles: + - role_name + + tasks: + - name: Main task + ansible.builtin.debug: + msg: "Main task" + + handlers: + - name: Handler name + ansible.builtin.service: + name: service + state: restarted + + post_tasks: + - name: Post-task + ansible.builtin.debug: + msg: "Running after main tasks" +``` + +## Task Options + +```yaml +tasks: + - name: Task with common options + ansible.builtin.command: /bin/command + become: true # Privilege escalation + become_user: www-data # Run as specific user + when: condition # Conditional execution + register: result # Store output + ignore_errors: true # Continue on failure + changed_when: false # Override change detection + failed_when: result.rc != 0 # Custom failure condition + tags: + - deploy + - config + notify: Handler name # Trigger handler +``` + +## Loops + +```yaml +# Simple loop +- name: Install packages + ansible.builtin.apt: + name: "{{ item }}" + state: present + loop: + - nginx + - python3 + +# Loop with dict +- name: Create users + ansible.builtin.user: + name: "{{ item.name }}" + groups: "{{ item.groups }}" + loop: + - { name: 'user1', groups: 'admin' } + - { name: 'user2', groups: 'users' } + +# Loop over dict +- name: 
Process items + ansible.builtin.debug: + msg: "{{ item.key }}: {{ item.value }}" + loop: "{{ my_dict | dict2items }}" + +# Loop with index +- name: With index + ansible.builtin.debug: + msg: "{{ index }}: {{ item }}" + loop: "{{ my_list }}" + loop_control: + index_var: index +``` + +## Conditionals + +```yaml +# Simple when +- name: Only on Ubuntu + ansible.builtin.apt: + name: package + when: ansible_distribution == "Ubuntu" + +# Multiple conditions +- name: Complex condition + ansible.builtin.command: /bin/something + when: + - ansible_os_family == "Debian" + - ansible_distribution_version is version('20.04', '>=') + +# Or conditions +- name: Or condition + ansible.builtin.command: /bin/something + when: condition1 or condition2 + +# Check variable +- name: If defined + ansible.builtin.debug: + msg: "{{ my_var }}" + when: my_var is defined +``` + +## Blocks + +```yaml +- name: Block example + block: + - name: Task 1 + ansible.builtin.command: /bin/task1 + + - name: Task 2 + ansible.builtin.command: /bin/task2 + + rescue: + - name: Handle failure + ansible.builtin.debug: + msg: "Block failed" + + always: + - name: Always run + ansible.builtin.debug: + msg: "Cleanup" +``` + +## Handlers + +```yaml +tasks: + - name: Update config + ansible.builtin.template: + src: config.j2 + dest: /etc/app/config + notify: + - Restart service + - Reload config + +handlers: + - name: Restart service + ansible.builtin.service: + name: app + state: restarted + + - name: Reload config + ansible.builtin.service: + name: app + state: reloaded +``` + +Handlers run once at end of play, even if notified multiple times. 
+ +## Including Tasks + +```yaml +# Include tasks file +- name: Include tasks + ansible.builtin.include_tasks: tasks/setup.yml + +# Import tasks (static) +- name: Import tasks + ansible.builtin.import_tasks: tasks/setup.yml + +# Include with variables +- name: Include with vars + ansible.builtin.include_tasks: tasks/deploy.yml + vars: + environment: production +``` + +## Tags + +```yaml +tasks: + - name: Tagged task + ansible.builtin.command: /bin/command + tags: + - deploy + - always # Always runs regardless of tag selection + + - name: Never runs by default + ansible.builtin.command: /bin/command + tags: never # Only runs when explicitly tagged +``` + +Run with tags: +```bash +ansible-playbook playbook.yml --tags "deploy" +ansible-playbook playbook.yml --skip-tags "slow" +``` + +## Check Mode + +```yaml +# Force check mode behavior +- name: Always runs in check + ansible.builtin.command: /bin/command + check_mode: false # Runs for real even when playbook is run with --check + +- name: Forced into check mode + ansible.builtin.command: /bin/command + check_mode: true # Always treated as check mode, even in a normal run (makes no changes) +``` + +## Delegation + +```yaml +# Run on different host +- name: Update load balancer + ansible.builtin.command: /bin/update-lb + delegate_to: loadbalancer + +# Run locally +- name: Local action + ansible.builtin.command: /bin/local-command + delegate_to: localhost + +# Run once for all hosts +- name: Single execution + ansible.builtin.command: /bin/command + run_once: true +``` diff --git a/skills/ansible/references/proxmox/authentication.md b/skills/ansible/references/proxmox/authentication.md new file mode 100644 index 0000000..be15745 --- /dev/null +++ b/skills/ansible/references/proxmox/authentication.md @@ -0,0 +1,155 @@ +# Ansible Proxmox Authentication + +## API Token Setup + +Create a dedicated Ansible user and API token on Proxmox: + +```bash +# On Proxmox node +pveum user add ansible@pve +pveum aclmod / -user ansible@pve -role PVEAdmin +pveum user token add ansible@pve mytoken --privsep 0 +``` +
+**Note:** `--privsep 0` gives the token the same permissions as the user. + +## Playbook Variables + +### Direct in playbook (NOT recommended) + +```yaml +vars: + proxmox_api_host: proxmox.example.com + proxmox_api_user: ansible@pve + proxmox_api_token_id: mytoken + proxmox_api_token_secret: "{{ vault_proxmox_token }}" +``` + +### Group vars with vault + +```yaml +# group_vars/all.yml +proxmox_api_host: proxmox.example.com +proxmox_api_user: ansible@pve +proxmox_api_token_id: mytoken + +# group_vars/secrets.yml (ansible-vault encrypted) +proxmox_api_token_secret: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +``` + +### Environment variables + +```bash +export PROXMOX_HOST=proxmox.example.com +export PROXMOX_USER=ansible@pve +export PROXMOX_TOKEN_ID=mytoken +export PROXMOX_TOKEN_SECRET=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +``` + +```yaml +# In playbook +vars: + proxmox_api_host: "{{ lookup('env', 'PROXMOX_HOST') }}" + proxmox_api_user: "{{ lookup('env', 'PROXMOX_USER') }}" + proxmox_api_token_id: "{{ lookup('env', 'PROXMOX_TOKEN_ID') }}" + proxmox_api_token_secret: "{{ lookup('env', 'PROXMOX_TOKEN_SECRET') }}" +``` + +## Reusable Auth Block + +Define once, reuse across tasks: + +```yaml +vars: + proxmox_auth: &proxmox_auth + api_host: "{{ proxmox_api_host }}" + api_user: "{{ proxmox_api_user }}" + api_token_id: "{{ proxmox_api_token_id }}" + api_token_secret: "{{ proxmox_api_token_secret }}" + validate_certs: false # For self-signed certs + +tasks: + - name: Create VM + community.general.proxmox_kvm: + <<: *proxmox_auth + node: joseph + vmid: 300 + name: myvm + state: present + + - name: Start VM + community.general.proxmox_kvm: + <<: *proxmox_auth + vmid: 300 + state: started +``` + +## TLS Certificate Handling + +### Self-signed certificates + +```yaml +community.general.proxmox_kvm: + # ... auth params ... 
+ validate_certs: false +``` + +### Custom CA + +```bash +export SSL_CERT_FILE=/path/to/ca-bundle.crt +``` + +Or in ansible.cfg: + +```ini +[defaults] +# For urllib3/requests +ca_cert = /path/to/ca-bundle.crt +``` + +## Minimum Required Permissions + +For full VM/container management: + +| Permission | Path | Purpose | +|------------|------|---------| +| VM.Allocate | / | Create VMs | +| VM.Clone | / | Clone templates | +| VM.Config.* | / | Modify VM config | +| VM.PowerMgmt | / | Start/stop VMs | +| VM.Snapshot | / | Create snapshots | +| Datastore.AllocateSpace | / | Allocate disk space | +| Datastore.Audit | / | List storage | + +Or use the built-in `PVEAdmin` role for full access. + +## Troubleshooting Auth Issues + +```yaml +# Debug task to test connection +- name: Test Proxmox API connection + community.general.proxmox_kvm: + api_host: "{{ proxmox_api_host }}" + api_user: "{{ proxmox_api_user }}" + api_token_id: "{{ proxmox_api_token_id }}" + api_token_secret: "{{ proxmox_api_token_secret }}" + validate_certs: false + vmid: 100 + state: current + register: result + ignore_errors: true + +- name: Show result + ansible.builtin.debug: + var: result +``` + +Common errors: + +| Error | Cause | Fix | +|-------|-------|-----| +| 401 Unauthorized | Bad token | Verify token ID format: `user@realm!tokenname` | +| 403 Forbidden | Insufficient permissions | Check user ACLs with `pveum user permissions ansible@pve` | +| SSL certificate problem | Self-signed cert | Set `validate_certs: false` | +| Connection refused | Wrong host/port | Verify API URL (port 8006) | diff --git a/skills/ansible/references/proxmox/dynamic-inventory.md b/skills/ansible/references/proxmox/dynamic-inventory.md new file mode 100644 index 0000000..762e50a --- /dev/null +++ b/skills/ansible/references/proxmox/dynamic-inventory.md @@ -0,0 +1,195 @@ +# Ansible Proxmox Dynamic Inventory + +Query Proxmox API for automatic inventory generation. 
+ +## Plugin Setup + +### Requirements + +```bash +pip install proxmoxer requests +ansible-galaxy collection install community.general +``` + +### Inventory File + +Create `inventory/proxmox.yml`: + +```yaml +plugin: community.general.proxmox +url: https://proxmox.example.com:8006 +user: ansible@pve +token_id: mytoken +token_secret: "{{ lookup('env', 'PROXMOX_TOKEN_SECRET') }}" +validate_certs: false + +# Include VMs and containers +want_facts: true +want_proxmox_nodes_ansible_host: false + +# Filter by status +filters: + - proxmox_status == "running" + +# Group by various attributes +groups: + # By Proxmox node + node_joseph: proxmox_node == "joseph" + node_maxwell: proxmox_node == "maxwell" + node_everette: proxmox_node == "everette" + + # By type + vms: proxmox_type == "qemu" + containers: proxmox_type == "lxc" + + # By template naming convention + docker_hosts: "'docker' in proxmox_name" + pihole: "'pihole' in proxmox_name" + +# Host variables from Proxmox +compose: + # Bracket notation is required: hyphens in dotted attribute access parse as subtraction in Jinja2 + ansible_host: proxmox_agent_interfaces[0]['ip-addresses'][0]['ip-address'] | default(proxmox_name) + ansible_user: "'ubuntu'" + proxmox_vmid: proxmox_vmid + proxmox_node: proxmox_node +``` + +### Enable in ansible.cfg + +```ini +[inventory] +enable_plugins = community.general.proxmox, yaml, ini +``` + +## Testing Inventory + +```bash +# List all hosts +ansible-inventory -i inventory/proxmox.yml --list + +# Graph view +ansible-inventory -i inventory/proxmox.yml --graph + +# Specific host details +ansible-inventory -i inventory/proxmox.yml --host myvm +``` + +## Common Patterns + +### Filter by Tags + +Proxmox 7+ supports VM tags: + +```yaml +groups: + webservers: "'web' in proxmox_tags" + databases: "'db' in proxmox_tags" + production: "'prod' in proxmox_tags" +``` + +### Filter by VMID Range + +```yaml +filters: + - proxmox_vmid >= 200 + - proxmox_vmid < 300 + +groups: + dev_vms: proxmox_vmid >= 200 and proxmox_vmid < 300 + prod_vms: proxmox_vmid >= 300 and proxmox_vmid < 400 +``` + +### IP Address from QEMU Agent + +Requires
QEMU guest agent running in VM: + +```yaml +compose: + # Primary IP from agent + ansible_host: >- + proxmox_agent_interfaces + | selectattr('name', 'equalto', 'eth0') + | map(attribute='ip-addresses') + | flatten + | selectattr('ip-address-type', 'equalto', 'ipv4') + | map(attribute='ip-address') + | first + | default(proxmox_name) +``` + +### Static + Dynamic Inventory + +Combine with static inventory: + +```bash +# inventory/ +# static.yml # Static hosts +# proxmox.yml # Dynamic from Proxmox + +ansible-playbook -i inventory/ playbook.yml +``` + +## Available Variables + +Variables populated from Proxmox API: + +| Variable | Description | +|----------|-------------| +| proxmox_vmid | VM/container ID | +| proxmox_name | VM/container name | +| proxmox_type | "qemu" or "lxc" | +| proxmox_status | running, stopped, etc. | +| proxmox_node | Proxmox node name | +| proxmox_pool | Resource pool (if any) | +| proxmox_tags | Tags (Proxmox 7+) | +| proxmox_template | Is template (bool) | +| proxmox_agent | QEMU agent enabled (bool) | +| proxmox_agent_interfaces | Network info from agent | +| proxmox_cpus | CPU count | +| proxmox_maxmem | Max memory bytes | +| proxmox_maxdisk | Max disk bytes | + +## Caching + +Enable caching for faster inventory: + +```yaml +plugin: community.general.proxmox +# ... auth ... + +cache: true +cache_plugin: jsonfile +cache_connection: /tmp/ansible_proxmox_cache +cache_timeout: 300 # 5 minutes +``` + +Clear cache: +```bash +rm -rf /tmp/ansible_proxmox_cache +``` + +## Troubleshooting + +### No hosts returned + +1. Check API connectivity: + ```bash + curl -k "https://proxmox:8006/api2/json/cluster/resources" \ + -H "Authorization: PVEAPIToken=ansible@pve!mytoken=secret" + ``` + +2. Check filters aren't too restrictive - try removing them + +3. 
Verify token permissions include `VM.Audit` + +### QEMU agent data missing + +- Agent must be installed and running in guest +- `want_facts: true` must be set +- May take a few seconds after VM boot + +### Slow inventory queries + +- Enable caching (see above) +- Use filters to reduce results +- Avoid `want_facts: true` if not needed diff --git a/skills/ansible/references/proxmox/gotchas.md b/skills/ansible/references/proxmox/gotchas.md new file mode 100644 index 0000000..6ed4315 --- /dev/null +++ b/skills/ansible/references/proxmox/gotchas.md @@ -0,0 +1,202 @@ +# Ansible Proxmox Gotchas + +Common issues when using Ansible with Proxmox VE. + +## 1. Token ID Format + +**Wrong:** +```yaml +api_token_id: ansible@pve!mytoken +``` + +**Correct:** +```yaml +api_token_id: mytoken # Just the token name, NOT user@realm!tokenname +``` + +The module combines `api_user` and `api_token_id` internally. + +## 2. VMID Required for Most Operations + +Unlike Terraform, you must always specify `vmid`: + +```yaml +# Won't auto-generate VMID +- name: Create VM + community.general.proxmox_kvm: + # ... auth ... + vmid: 300 # REQUIRED - no auto-assignment + name: myvm +``` + +To find next available VMID: +```yaml +- name: Get cluster resources + ansible.builtin.uri: + url: "https://{{ proxmox_api_host }}:8006/api2/json/cluster/resources" + headers: + Authorization: "PVEAPIToken={{ proxmox_api_user }}!{{ proxmox_api_token_id }}={{ proxmox_api_token_secret }}" + validate_certs: false + register: resources + +- name: Calculate next VMID + ansible.builtin.set_fact: + next_vmid: "{{ (resources.json.data | selectattr('vmid', 'defined') | map(attribute='vmid') | max) + 1 }}" +``` + +## 3. Node Parameter Required + +Must specify which node to operate on: + +```yaml +- name: Create VM + community.general.proxmox_kvm: + # ... auth ... + node: joseph # REQUIRED - which Proxmox node + vmid: 300 +``` + +## 4. 
Clone vs Create + +Cloning requires different parameters than creating: + +```yaml +# CLONE from template +- name: Clone VM + community.general.proxmox_kvm: + # ... auth ... + node: joseph + vmid: 300 + name: myvm + clone: tmpl-ubuntu-2404-standard # Template name or VMID + full: true + +# CREATE new (less common) +- name: Create VM + community.general.proxmox_kvm: + # ... auth ... + node: joseph + vmid: 300 + name: myvm + ostype: l26 + scsihw: virtio-scsi-pci + bootdisk: scsi0 + scsi: + scsi0: 'local-lvm:32,format=raw' +``` + +## 5. Async Operations + +Large operations (clone, snapshot) can timeout. Use async: + +```yaml +- name: Clone large VM + community.general.proxmox_kvm: + # ... auth ... + clone: large-template + vmid: 300 + timeout: 600 # Module timeout + async: 900 # Ansible async timeout + poll: 10 # Check every 10 seconds +``` + +## 6. State Idempotency + +`state: present` doesn't update existing VMs: + +```yaml +# This WON'T change cores on existing VM +- name: Create/update VM + community.general.proxmox_kvm: + # ... auth ... + vmid: 300 + cores: 4 # Ignored if VM exists + state: present +``` + +To modify existing VMs, use `proxmox_kvm` with `update: true` (Ansible 2.14+) or use the API directly. + +## 7. Network Interface Format (LXC) + +LXC containers use a specific JSON-like string format: + +```yaml +# WRONG +netif: + net0: + bridge: vmbr0 + ip: dhcp + +# CORRECT +netif: '{"net0":"name=eth0,bridge=vmbr0,ip=dhcp"}' + +# Multiple interfaces +netif: '{"net0":"name=eth0,bridge=vmbr0,ip=dhcp","net1":"name=eth1,bridge=vmbr12,ip=dhcp"}' +``` + +## 8. Disk Resize Only Grows + +`proxmox_disk` resize only increases size: + +```yaml +# This adds 20G to current size +- name: Grow disk + community.general.proxmox_disk: + # ... auth ... + vmid: 300 + disk: scsi0 + size: +20G # Relative increase + state: resized + +# NOT possible to shrink +``` + +## 9. 
Template vs VM States + +Templates don't support all states: + +```yaml +# Can't start a template +- name: Start template + community.general.proxmox_kvm: + vmid: 100 + state: started # FAILS - templates can't run +``` + +Convert template to VM first if needed. + +## 10. Collection Version Matters + +Module parameters change between versions. Check installed version: + +```bash +ansible-galaxy collection list | grep community.general +``` + +Update if needed: +```bash +ansible-galaxy collection install community.general --upgrade +``` + +## 11. Cloud-Init Not Supported + +Unlike Terraform's Proxmox provider, the Ansible modules have limited cloud-init support. For cloud-init VMs: + +1. Clone template with cloud-init already configured +2. Use API calls to set cloud-init parameters +3. Or configure post-boot with Ansible + +```yaml +# Workaround: Use URI module for cloud-init config +- name: Set cloud-init IP + ansible.builtin.uri: + url: "https://{{ proxmox_api_host }}:8006/api2/json/nodes/{{ node }}/qemu/{{ vmid }}/config" + method: PUT + headers: + Authorization: "PVEAPIToken={{ proxmox_api_user }}!{{ proxmox_api_token_id }}={{ proxmox_api_token_secret }}" + body_format: form-urlencoded + body: + ipconfig0: "ip=192.168.1.100/24,gw=192.168.1.1" + ciuser: ubuntu + validate_certs: false +``` diff --git a/skills/ansible/references/proxmox/modules.md b/skills/ansible/references/proxmox/modules.md new file mode 100644 index 0000000..9f1aa40 --- /dev/null +++ b/skills/ansible/references/proxmox/modules.md @@ -0,0 +1,232 @@ +# Ansible Proxmox Modules + +Proxmox VE management via `community.general` collection. 
+ +## Collection Setup + +```bash +ansible-galaxy collection install community.general +``` + +## Core Modules + +### proxmox (LXC Containers) + +```yaml +- name: Create LXC container + community.general.proxmox: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + node: joseph + vmid: 200 + hostname: mycontainer + ostemplate: local:vztmpl/ubuntu-22.04-standard_22.04-1_amd64.tar.zst + storage: local-lvm + cores: 2 + memory: 2048 + disk: 10 + netif: '{"net0":"name=eth0,bridge=vmbr0,ip=dhcp"}' + state: present + +- name: Start container + community.general.proxmox: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + node: joseph + vmid: 200 + state: started + +- name: Stop container + community.general.proxmox: + # ... auth params ... + vmid: 200 + state: stopped + force: true # Force stop if graceful fails + +- name: Remove container + community.general.proxmox: + # ... auth params ... + vmid: 200 + state: absent +``` + +### proxmox_kvm (VMs) + +```yaml +- name: Create VM from template + community.general.proxmox_kvm: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + node: joseph + vmid: 300 + name: myvm + clone: tmpl-ubuntu-2404-standard + full: true # Full clone (not linked) + storage: local-lvm + format: raw + timeout: 500 + +- name: Start VM + community.general.proxmox_kvm: + # ... auth params ... + node: joseph + vmid: 300 + state: started + +- name: Stop VM (ACPI shutdown) + community.general.proxmox_kvm: + # ... auth params ... + vmid: 300 + state: stopped + force: false # Graceful ACPI + +- name: Force stop VM + community.general.proxmox_kvm: + # ... auth params ... + vmid: 300 + state: stopped + force: true + +- name: Current state (running/stopped/present/absent) + community.general.proxmox_kvm: + # ... auth params ... 
+ vmid: 300 + state: current + register: vm_state +``` + +### proxmox_template + +```yaml +- name: Convert VM to template + community.general.proxmox_template: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + node: joseph + vmid: 100 + state: present # Convert to template + +- name: Delete template + community.general.proxmox_template: + # ... auth params ... + vmid: 100 + state: absent +``` + +### proxmox_snap + +```yaml +- name: Create snapshot + community.general.proxmox_snap: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + vmid: 300 + snapname: before-upgrade + description: "Snapshot before major upgrade" + vmstate: false # Don't include RAM + state: present + +- name: Rollback to snapshot + community.general.proxmox_snap: + # ... auth params ... + vmid: 300 + snapname: before-upgrade + state: rollback + +- name: Remove snapshot + community.general.proxmox_snap: + # ... auth params ... + vmid: 300 + snapname: before-upgrade + state: absent +``` + +### proxmox_nic + +```yaml +- name: Add NIC to VM + community.general.proxmox_nic: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + vmid: 300 + interface: net1 + bridge: vmbr12 + model: virtio + tag: 12 # VLAN tag + state: present + +- name: Remove NIC + community.general.proxmox_nic: + # ... auth params ... + vmid: 300 + interface: net1 + state: absent +``` + +### proxmox_disk + +```yaml +- name: Add disk to VM + community.general.proxmox_disk: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ proxmox_token_secret }}" + vmid: 300 + disk: scsi1 + storage: local-lvm + size: 50G + format: raw + state: present + +- name: Resize disk + community.general.proxmox_disk: + # ... auth params ... 
+ vmid: 300 + disk: scsi0 + size: +20G # Increase by 20G + state: resized + +- name: Detach disk + community.general.proxmox_disk: + # ... auth params ... + vmid: 300 + disk: scsi1 + state: absent +``` + +## State Reference + +| Module | States | +|--------|--------| +| proxmox (LXC) | present, started, stopped, restarted, absent | +| proxmox_kvm | present, started, stopped, restarted, absent, current | +| proxmox_template | present, absent | +| proxmox_snap | present, absent, rollback | +| proxmox_nic | present, absent | +| proxmox_disk | present, absent, resized | + +## Common Parameters + +All modules share these authentication parameters: + +| Parameter | Description | +|-----------|-------------| +| api_host | Proxmox hostname/IP | +| api_user | User (format: user@realm) | +| api_token_id | API token name | +| api_token_secret | API token value | +| validate_certs | Verify TLS (default: true) | +| timeout | API timeout seconds | diff --git a/skills/ansible/references/troubleshooting.md b/skills/ansible/references/troubleshooting.md new file mode 100644 index 0000000..cc8663e --- /dev/null +++ b/skills/ansible/references/troubleshooting.md @@ -0,0 +1,295 @@ +# Ansible Troubleshooting Reference + +## Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| SSH connection failed | Wrong host/key/user | Check ansible_host, ansible_user, key | +| Permission denied | Need sudo/wrong user | Add `become: true`, check sudo config | +| Module not found | Collection not installed | `ansible-galaxy collection install` | +| Variable undefined | Missing var/typo | Check var name, define in vars | +| Syntax error | YAML/Jinja2 issue | Run `ansible-playbook --syntax-check` | +| Host unreachable | Network/SSH issue | `ansible host -m ping`, check firewall | + +## Debug Commands + +```bash +# Test connectivity +ansible all -m ping +ansible host -m ping -vvv + +# Syntax check +ansible-playbook playbook.yml --syntax-check + +# Dry run (check mode) 
+ansible-playbook playbook.yml --check + +# Diff mode (show changes) +ansible-playbook playbook.yml --diff + +# Verbose output +ansible-playbook playbook.yml -v # Minimal +ansible-playbook playbook.yml -vv # More +ansible-playbook playbook.yml -vvv # Connection debug +ansible-playbook playbook.yml -vvvv # Full debug + +# List tasks without running +ansible-playbook playbook.yml --list-tasks + +# List hosts +ansible-playbook playbook.yml --list-hosts + +# Start at specific task +ansible-playbook playbook.yml --start-at-task="Task name" + +# Step through tasks +ansible-playbook playbook.yml --step +``` + +## Connection Issues + +### Test SSH + +```bash +# Direct SSH test +ssh -i ~/.ssh/key user@host + +# Ansible ping +ansible host -m ping -vvv + +# Check SSH config +ansible host -m debug -a "var=ansible_ssh_private_key_file" +``` + +### Common SSH Fixes + +```yaml +# In inventory or ansible.cfg +ansible_ssh_private_key_file: ~/.ssh/mykey +ansible_user: ubuntu +ansible_host: 192.168.1.10 +host_key_checking: False # Only for testing +``` + +### SSH Connection Options + +```yaml +# In inventory +host1: + ansible_host: 192.168.1.10 + ansible_ssh_common_args: '-o StrictHostKeyChecking=no' + ansible_ssh_extra_args: '-o ConnectTimeout=10' +``` + +## Permission Issues + +### Sudo Not Working + +```yaml +# Enable become +- hosts: all + become: true + become_method: sudo + become_user: root +``` + +```bash +# On target host, check sudoers +sudo visudo + +# User should have: +# ubuntu ALL=(ALL) NOPASSWD: ALL +``` + +### Ask for Sudo Password + +```bash +ansible-playbook playbook.yml --ask-become-pass +``` + +## Variable Issues + +### Debug Variables + +```yaml +- name: Print all vars + ansible.builtin.debug: + var: vars + +- name: Print specific var + ansible.builtin.debug: + var: my_var + +- name: Print hostvars + ansible.builtin.debug: + var: hostvars[inventory_hostname] + +- name: Print facts + ansible.builtin.debug: + var: ansible_facts +``` + +### Check Variable Precedence 
+ +```bash +# See where variable comes from +ansible-inventory --host hostname --yaml +``` + +### Undefined Variable + +```yaml +# Provide default +value: "{{ my_var | default('fallback') }}" + +# Check if defined +- name: Task + when: my_var is defined + +# Fail early if required +- name: Validate + ansible.builtin.assert: + that: my_var is defined + fail_msg: "my_var must be set" +``` + +## Module Issues + +### Module Not Found + +```bash +# Install collection +ansible-galaxy collection install community.docker + +# Check installed +ansible-galaxy collection list + +# Update collections +ansible-galaxy collection install -r requirements.yml --force +``` + +### Module Arguments + +```bash +# Get module documentation +ansible-doc ansible.builtin.copy +ansible-doc community.docker.docker_compose_v2 +``` + +## Idempotency Issues + +### Task Always Shows "changed" + +```yaml +# Bad - always changed +- name: Run script + ansible.builtin.command: /bin/script.sh + +# Good - check first +- name: Run script + ansible.builtin.command: /bin/script.sh + args: + creates: /opt/app/.installed + +# Good - explicit changed_when +- name: Run script + ansible.builtin.command: /bin/script.sh + register: result + changed_when: "'Created' in result.stdout" +``` + +### Test Idempotency + +```bash +# Run twice, second should show all "ok" +ansible-playbook playbook.yml +ansible-playbook playbook.yml # Should show "changed=0" +``` + +## Handler Issues + +### Handler Not Running + +- Handlers only run if task reports "changed" +- Handlers run at end of play, not immediately +- Force handler run: `ansible-playbook --force-handlers` + +```yaml +# Force handler to run immediately +- name: Config change + ansible.builtin.template: + src: config.j2 + dest: /etc/app/config + notify: Restart app + +- name: Flush handlers + ansible.builtin.meta: flush_handlers + +- name: Continue with restarted service + ansible.builtin.uri: + url: http://localhost:8080/health +``` + +## Performance Issues + +### 
Slow Playbook + +```yaml +# Disable fact gathering if not needed +- hosts: all + gather_facts: false + +# Or gather specific facts +- hosts: all + gather_facts: true + gather_subset: + - network +``` + +```bash +# Increase parallelism +ansible-playbook playbook.yml -f 20 # 20 forks + +# Use pipelining (add to ansible.cfg) +# [ssh_connection] +# pipelining = True +``` + +### Callback Timer + +```ini +# ansible.cfg +[defaults] +callbacks_enabled = timer, profile_tasks +``` + +## Recovery + +### Failed Playbook + +```bash +# Retry failed hosts +ansible-playbook playbook.yml --limit @playbook.retry + +# Start at failed task +ansible-playbook playbook.yml --start-at-task="Failed Task Name" +``` + +### Cleanup After Failure + +```yaml +- name: Risky operation + block: + - name: Do something + ansible.builtin.command: /bin/risky + rescue: + - name: Cleanup on failure + ansible.builtin.file: + path: /tmp/incomplete + state: absent + always: + - name: Always cleanup + ansible.builtin.file: + path: /tmp/lock + state: absent +``` diff --git a/skills/ansible/references/variables.md b/skills/ansible/references/variables.md new file mode 100644 index 0000000..c30bf69 --- /dev/null +++ b/skills/ansible/references/variables.md @@ -0,0 +1,246 @@ +# Ansible Variables Reference + +## Variable Precedence (High to Low) + +1. **Extra vars** (`-e "var=value"`) +2. **Task vars** (in task) +3. **Block vars** (in block) +4. **Role/include vars** +5. **set_facts / registered vars** +6. **Play vars_files** +7. **Play vars_prompt** +8. **Play vars** +9. **Host facts** +10. **Playbook host_vars/** +11. **Inventory host_vars/** +12. **Playbook group_vars/** +13. **Inventory group_vars/** +14. **Playbook group_vars/all** +15. **Inventory group_vars/all** +16. 
**Role defaults** + +## Defining Variables + +### In Playbook + +```yaml +- hosts: all + vars: + app_name: myapp + app_port: 8080 + + vars_files: + - vars/common.yml + - "vars/{{ environment }}.yml" +``` + +### In Tasks + +```yaml +- name: Set variable + ansible.builtin.set_fact: + my_var: "value" + +- name: Register output + ansible.builtin.command: whoami + register: user_result + +- name: Use registered + ansible.builtin.debug: + msg: "User: {{ user_result.stdout }}" +``` + +### In Roles + +```yaml +# roles/app/defaults/main.yml (low priority) +app_port: 8080 + +# roles/app/vars/main.yml (high priority) +internal_setting: value +``` + +## Variable Types + +```yaml +# String +name: "value" + +# Number +port: 8080 + +# Boolean +enabled: true + +# List +packages: + - nginx + - python3 + +# Dictionary +user: + name: admin + groups: + - wheel + - docker +``` + +## Accessing Variables + +```yaml +# Simple +msg: "{{ my_var }}" + +# Dictionary +msg: "{{ user.name }}" +msg: "{{ user['name'] }}" + +# List +msg: "{{ packages[0] }}" +msg: "{{ packages | first }}" + +# Default value +msg: "{{ my_var | default('fallback') }}" + +# Required (fail if undefined) +msg: "{{ my_var }}" # Fails if undefined +``` + +## Jinja2 Filters + +```yaml +# Default +value: "{{ var | default('default') }}" + +# Mandatory +value: "{{ var | mandatory }}" + +# Type conversion +port: "{{ port_string | int }}" +flag: "{{ flag_string | bool }}" + +# String operations +upper: "{{ name | upper }}" +lower: "{{ name | lower }}" +title: "{{ name | title }}" + +# Lists +first: "{{ list | first }}" +last: "{{ list | last }}" +length: "{{ list | length }}" +joined: "{{ list | join(',') }}" + +# JSON +json_str: "{{ dict | to_json }}" +yaml_str: "{{ dict | to_yaml }}" + +# Path operations +basename: "{{ path | basename }}" +dirname: "{{ path | dirname }}" +``` + +## Facts + +```yaml +# Accessing facts +os: "{{ ansible_distribution }}" +version: "{{ ansible_distribution_version }}" +ip: "{{ 
ansible_default_ipv4.address }}" +hostname: "{{ ansible_hostname }}" +memory_mb: "{{ ansible_memtotal_mb }}" +cpus: "{{ ansible_processor_vcpus }}" +``` + +### Gathering Facts + +```yaml +- hosts: all + gather_facts: true # Default + +# Or manually +- name: Gather facts + ansible.builtin.setup: + filter: ansible_* + +# Specific facts +- name: Get network facts + ansible.builtin.setup: + gather_subset: + - network +``` + +## Environment Variables + +```yaml +# Lookup +value: "{{ lookup('env', 'MY_VAR') }}" + +# Set for task +- name: Run with env + ansible.builtin.command: /bin/command + environment: + MY_VAR: "{{ my_value }}" +``` + +## Secrets/Vault + +```bash +# Create encrypted file +ansible-vault create secrets.yml + +# Edit encrypted file +ansible-vault edit secrets.yml + +# Encrypt existing file +ansible-vault encrypt vars.yml + +# Run with vault password +ansible-playbook playbook.yml --ask-vault-pass +ansible-playbook playbook.yml --vault-password-file ~/.vault_pass +``` + +## Prompt for Variables + +```yaml +- hosts: all + vars_prompt: + - name: password + prompt: "Enter password" + private: true + + - name: environment + prompt: "Which environment?" 
+ default: "staging" +``` + +## Conditionals with Variables + +```yaml +- name: Check defined + when: my_var is defined + +- name: Check undefined + when: my_var is not defined + +- name: Check truthy + when: my_var | bool + +- name: Check falsy + when: not my_var | bool + +- name: Check in list + when: item in my_list + +- name: Version comparison + when: version is version('2.0', '>=') +``` + +## Hostvars + +Access variables from other hosts: + +```yaml +- name: Get from other host + ansible.builtin.debug: + msg: "{{ hostvars['web1']['ansible_host'] }}" +``` diff --git a/skills/docker/SKILL.md b/skills/docker/SKILL.md new file mode 100644 index 0000000..38ea3cb --- /dev/null +++ b/skills/docker/SKILL.md @@ -0,0 +1,121 @@ +--- +name: docker +description: | + Docker and Docker Compose reference for container deployment, networking, volumes, + and orchestration. Includes Proxmox hosting and LXC comparison patterns. + Use when working with docker-compose.yaml, Dockerfiles, troubleshooting containers, + or planning container architecture. + Triggers: docker, compose, container, dockerfile, volume, network, service, lxc. +--- + +# Docker Skill + +Docker and Docker Compose reference for containerized application deployment and management. 
+ +## Quick Reference + +```bash +# Container operations +docker ps # List running containers +docker ps -a # List all containers +docker logs # View logs +docker logs -f # Follow logs +docker exec -it sh # Shell into container +docker inspect # Full container details + +# Compose operations +docker compose up -d # Start services (detached) +docker compose down # Stop and remove +docker compose ps # List compose services +docker compose logs -f # Follow all logs +docker compose pull # Pull latest images +docker compose restart # Restart services + +# Troubleshooting +docker stats # Resource usage +docker network ls # List networks +docker network inspect # Network details +docker volume ls # List volumes +docker system df # Disk usage +docker system prune # Clean up unused resources +``` + +## Reference Files + +Load on-demand based on task: + +| Topic | File | When to Load | +|-------|------|--------------| +| Compose Structure | [compose.md](references/compose.md) | Writing docker-compose.yaml | +| Networking | [networking.md](references/networking.md) | Network modes, port mapping | +| Volumes | [volumes.md](references/volumes.md) | Data persistence, mounts | +| Dockerfile | [dockerfile.md](references/dockerfile.md) | Building images | +| Troubleshooting | [troubleshooting.md](references/troubleshooting.md) | Common errors, diagnostics | + +### Proxmox Integration + +| Topic | File | When to Load | +|-------|------|--------------| +| Docker on Proxmox | [proxmox/hosting.md](references/proxmox/hosting.md) | VM sizing, storage, GPU passthrough | +| LXC vs Docker | [proxmox/lxc-vs-docker.md](references/proxmox/lxc-vs-docker.md) | Choosing container type | + +## Compose File Quick Reference + +```yaml +name: myapp # Project name (optional) + +services: + web: + image: nginx:alpine + ports: + - "80:80" + volumes: + - ./html:/usr/share/nginx/html:ro + networks: + - frontend + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost"] + 
interval: 30s + timeout: 10s + retries: 3 + +networks: + frontend: + driver: bridge + +volumes: + data: +``` + +## Validation Checklist + +Before deploying containers: + +- [ ] Services defined with specific image tags (not :latest) +- [ ] Port mappings without conflicts +- [ ] Volumes for persistent data +- [ ] Networks configured appropriately +- [ ] Resource limits set (memory, CPU) +- [ ] Health checks for critical services +- [ ] Restart policy appropriate +- [ ] Secrets not in images or compose file +- [ ] .env file for environment variables + +## Network Mode Quick Decision + +| Mode | Use Case | Isolation | +|------|----------|-----------| +| bridge | Default, most services | Container isolated | +| host | Performance, network tools | No isolation | +| macvlan | Direct LAN access | Own MAC/IP | +| ipvlan | Like macvlan, shared MAC | Own IP | +| none | No networking | Full isolation | + +## Volume Type Quick Decision + +| Type | Use Case | Portability | +|------|----------|-------------| +| Named volume | Database, app data | Best | +| Bind mount | Config files, dev | Host-dependent | +| tmpfs | Secrets, cache | Memory only | diff --git a/skills/docker/references/compose.md b/skills/docker/references/compose.md new file mode 100644 index 0000000..7a96764 --- /dev/null +++ b/skills/docker/references/compose.md @@ -0,0 +1,268 @@ +# Docker Compose Reference + +## File Structure + +```yaml +name: project-name # Optional, defaults to directory name + +services: + service-name: + # Image or build + image: image:tag + build: + context: ./path + dockerfile: Dockerfile + + # Networking + ports: + - "host:container" + networks: + - network-name + + # Storage + volumes: + - named-volume:/path + - ./host-path:/container-path + + # Environment + environment: + KEY: value + env_file: + - .env + + # Dependencies + depends_on: + - other-service + + # Lifecycle + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost"] + interval: 30s + 
timeout: 10s + retries: 3 + + # Resources + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + memory: 256M + +networks: + network-name: + driver: bridge + +volumes: + named-volume: +``` + +## Service Options + +### Image vs Build + +```yaml +# Use existing image +image: nginx:1.25-alpine + +# Build from Dockerfile +build: + context: . + dockerfile: Dockerfile + args: + BUILD_ARG: value +``` + +### Port Mapping + +```yaml +ports: + - "80:80" # host:container + - "443:443" + - "127.0.0.1:8080:80" # localhost only + - "8080-8090:8080-8090" # range +``` + +### Environment Variables + +```yaml +# Inline +environment: + DATABASE_URL: postgres://db:5432/app + DEBUG: "false" + +# From file +env_file: + - .env + - .env.local +``` + +### Dependencies + +```yaml +depends_on: + - db + - redis + +# With conditions (compose v2.1+) +depends_on: + db: + condition: service_healthy +``` + +### Restart Policies + +| Policy | Behavior | +|--------|----------| +| no | Never restart (default) | +| always | Always restart | +| unless-stopped | Restart unless manually stopped | +| on-failure | Restart only on error exit | + +### Health Checks + +```yaml +healthcheck: + test: ["CMD", "curl", "-f", "http://localhost/health"] + interval: 30s # Time between checks + timeout: 10s # Check timeout + retries: 3 # Failures before unhealthy + start_period: 40s # Grace period on startup +``` + +### Resource Limits + +```yaml +deploy: + resources: + limits: + cpus: '2' + memory: 1G + reservations: + cpus: '0.5' + memory: 256M +``` + +## Network Configuration + +### Custom Network + +```yaml +networks: + frontend: + driver: bridge + backend: + driver: bridge + internal: true # No external access +``` + +### External Network + +```yaml +networks: + existing-network: + external: true +``` + +### Macvlan Network + +```yaml +networks: + lan: + driver: macvlan + driver_opts: + parent: eth0 + ipam: + config: + - subnet: 192.168.1.0/24 + gateway: 192.168.1.1 +``` + +## Volume 
Configuration + +### Named Volume + +```yaml +volumes: + data: + driver: local + +services: + db: + volumes: + - data:/var/lib/mysql +``` + +### Bind Mount + +```yaml +services: + web: + volumes: + - ./config:/etc/app/config:ro + - ./data:/app/data +``` + +### tmpfs Mount + +```yaml +services: + app: + tmpfs: + - /tmp + - /run +``` + +## Multi-Environment Setup + +### Using .env Files + +```bash +# .env +COMPOSE_PROJECT_NAME=myapp +IMAGE_TAG=latest +``` + +```yaml +# docker-compose.yaml +services: + app: + image: myapp:${IMAGE_TAG} +``` + +### Override Files + +```bash +# Base config +docker-compose.yaml + +# Development overrides +docker-compose.override.yaml # Auto-loaded + +# Production +docker compose -f docker-compose.yaml -f docker-compose.prod.yaml up +``` + +## Useful Commands + +```bash +# Start with rebuild +docker compose up -d --build + +# Scale service +docker compose up -d --scale web=3 + +# View config after variable substitution +docker compose config + +# Execute command in service +docker compose exec web sh + +# View service logs +docker compose logs -f web + +# Restart single service +docker compose restart web +``` diff --git a/skills/docker/references/dockerfile.md b/skills/docker/references/dockerfile.md new file mode 100644 index 0000000..fee3aa3 --- /dev/null +++ b/skills/docker/references/dockerfile.md @@ -0,0 +1,243 @@ +# Dockerfile Reference + +## Basic Structure + +```dockerfile +# Base image +FROM ubuntu:22.04 + +# Metadata +LABEL maintainer="team@example.com" +LABEL version="1.0" + +# Environment +ENV APP_HOME=/app +WORKDIR $APP_HOME + +# Install dependencies +RUN apt-get update && apt-get install -y \ + package1 \ + package2 \ + && rm -rf /var/lib/apt/lists/* + +# Copy files +COPY requirements.txt . +RUN pip install -r requirements.txt +COPY . . 
+ +# Non-root user +RUN useradd -r -s /bin/false appuser +USER appuser + +# Expose port +EXPOSE 8080 + +# Health check +HEALTHCHECK --interval=30s --timeout=3s \ + CMD curl -f http://localhost:8080/health || exit 1 + +# Entry point +ENTRYPOINT ["python"] +CMD ["app.py"] +``` + +## Multi-Stage Builds + +Reduce final image size by separating build and runtime: + +```dockerfile +# Build stage +FROM golang:1.21 AS builder +WORKDIR /app +COPY . . +RUN go build -o myapp + +# Runtime stage +FROM alpine:3.18 +COPY --from=builder /app/myapp /usr/local/bin/ +CMD ["myapp"] +``` + +## Common Base Images + +| Image | Size | Use Case | +|-------|------|----------| +| alpine | ~5MB | Minimal, production | +| debian:slim | ~80MB | Compatibility | +| ubuntu | ~75MB | Development | +| distroless | ~20MB | Security-focused | +| scratch | 0MB | Static binaries only | + +## Instructions Reference + +### FROM + +```dockerfile +FROM image:tag +FROM image:tag AS builder +FROM --platform=linux/amd64 image:tag +``` + +### RUN + +```dockerfile +# Shell form +RUN apt-get update && apt-get install -y package + +# Exec form +RUN ["executable", "param1", "param2"] +``` + +### COPY vs ADD + +```dockerfile +# COPY - preferred for local files +COPY ./src /app/src +COPY --chown=user:group files /app/ + +# ADD - can extract tars, fetch URLs (use sparingly) +ADD archive.tar.gz /app/ +``` + +### ENV vs ARG + +```dockerfile +# ARG - build-time only +ARG VERSION=1.0 + +# ENV - persists in image +ENV APP_VERSION=$VERSION +``` + +### EXPOSE + +```dockerfile +EXPOSE 8080 +EXPOSE 443/tcp +EXPOSE 53/udp +``` + +Documentation only - doesn't publish ports. 
+ +### ENTRYPOINT vs CMD + +```dockerfile +# ENTRYPOINT - main executable +ENTRYPOINT ["python"] + +# CMD - default arguments (can be overridden) +CMD ["app.py"] + +# Combined: python app.py +# Override: docker run image other.py -> python other.py +``` + +### USER + +```dockerfile +RUN useradd -r -s /bin/false appuser +USER appuser +``` + +### HEALTHCHECK + +```dockerfile +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost/health || exit 1 +``` + +## Best Practices + +### Layer Caching + +Order from least to most frequently changed: + +```dockerfile +# Rarely changes - cached +FROM node:18-alpine +WORKDIR /app + +# Changes when deps change +COPY package*.json ./ +RUN npm install + +# Changes frequently - rebuild each time +COPY . . +``` + +### Reduce Layers + +Combine RUN commands: + +```dockerfile +# Bad - 3 layers +RUN apt-get update +RUN apt-get install -y package +RUN rm -rf /var/lib/apt/lists/* + +# Good - 1 layer +RUN apt-get update && \ + apt-get install -y package && \ + rm -rf /var/lib/apt/lists/* +``` + +### Security + +```dockerfile +# Use specific tags +FROM node:18.17.0-alpine # Not :latest + +# Non-root user +USER nobody + +# Read-only filesystem +# (Set at runtime with --read-only) + +# No secrets in image +# (Use build args or runtime secrets) +``` + +### .dockerignore + +``` +.git +.gitignore +node_modules +*.log +.env +Dockerfile +docker-compose.yaml +README.md +``` + +## Build Commands + +```bash +# Basic build +docker build -t myimage:tag . + +# With build args +docker build --build-arg VERSION=1.0 -t myimage . + +# No cache +docker build --no-cache -t myimage . + +# Specific Dockerfile +docker build -f Dockerfile.prod -t myimage . + +# Multi-platform +docker buildx build --platform linux/amd64,linux/arm64 -t myimage . +``` + +## Debugging Builds + +```bash +# Build with progress output +docker build --progress=plain -t myimage . 
+ +# Inspect layers +docker history myimage + +# Check image size +docker images myimage +``` diff --git a/skills/docker/references/networking.md b/skills/docker/references/networking.md new file mode 100644 index 0000000..984d979 --- /dev/null +++ b/skills/docker/references/networking.md @@ -0,0 +1,229 @@ +# Docker Networking Reference + +## Network Drivers + +### Bridge (Default) + +Isolated container network with port mapping. + +```yaml +networks: + app-network: + driver: bridge +``` + +- Containers get private IPs (172.17.0.0/16 default) +- Port mapping exposes services (`-p 80:80`) +- DNS resolution between containers by name +- Default for single-host deployments + +### Host + +Container shares host network stack. + +```yaml +services: + app: + network_mode: host +``` + +- No network isolation +- No port mapping needed (container uses host ports) +- Best performance (no NAT overhead) +- Use for: Network tools, performance-critical apps + +### Macvlan + +Container gets own MAC address on physical network. + +```yaml +networks: + lan: + driver: macvlan + driver_opts: + parent: eth0 + ipam: + config: + - subnet: 192.168.1.0/24 + gateway: 192.168.1.1 + ip_range: 192.168.1.128/25 +``` + +- Container appears as physical device on LAN +- Direct network access, no port mapping +- Use for: Services needing LAN presence (DNS, DHCP) +- Requires promiscuous mode on parent interface + +### IPvlan + +Like macvlan but shares host MAC address. + +```yaml +networks: + lan: + driver: ipvlan + driver_opts: + parent: eth0 + ipvlan_mode: l2 # or l3 +``` + +- L2 mode: Same subnet as host +- L3 mode: Different subnet, requires routing +- Use when: Macvlan blocked by switch, cloud environments + +### None + +No networking. 
+ +```yaml +services: + isolated: + network_mode: none +``` + +## Port Mapping + +```yaml +ports: + # Simple mapping + - "80:80" + + # Different host port + - "8080:80" + + # Localhost only + - "127.0.0.1:8080:80" + + # UDP + - "53:53/udp" + + # Range + - "8080-8090:8080-8090" + + # Random host port + - "80" +``` + +## DNS and Service Discovery + +### Automatic DNS + +Containers on same network resolve each other by service name: + +```yaml +services: + web: + networks: + - app + db: + networks: + - app +``` + +`web` can reach `db` at hostname `db`. + +### Aliases + +```yaml +services: + db: + networks: + app: + aliases: + - database + - mysql +``` + +### Custom DNS + +```yaml +services: + app: + dns: + - 8.8.8.8 + - 8.8.4.4 + dns_search: + - example.com +``` + +## Network Isolation + +### Internal Networks + +No external connectivity: + +```yaml +networks: + backend: + internal: true +``` + +### Multiple Networks + +```yaml +services: + web: + networks: + - frontend + - backend + + db: + networks: + - backend # Not on frontend + +networks: + frontend: + backend: + internal: true +``` + +## Static IPs + +```yaml +services: + app: + networks: + app-network: + ipv4_address: 172.20.0.10 + +networks: + app-network: + ipam: + config: + - subnet: 172.20.0.0/24 +``` + +## Troubleshooting + +### Inspect Network + +```bash +docker network ls +docker network inspect +``` + +### Container Network Info + +```bash +docker inspect --format '{{json .NetworkSettings.Networks}}' +``` + +### Test Connectivity + +```bash +# From inside container +docker exec ping +docker exec curl + +# Check DNS +docker exec nslookup +``` + +### Common Issues + +| Problem | Check | +|---------|-------| +| Can't reach container | Port mapping, firewall, network attachment | +| DNS not working | Same network, container running | +| Slow network | Network mode, MTU settings | +| Port already in use | `lsof -i :`, change mapping | diff --git a/skills/docker/references/proxmox/hosting.md 
b/skills/docker/references/proxmox/hosting.md new file mode 100644 index 0000000..3392ec2 --- /dev/null +++ b/skills/docker/references/proxmox/hosting.md @@ -0,0 +1,227 @@ +# Docker on Proxmox VMs + +Best practices for running Docker workloads on Proxmox VE. + +## Template Selection + +Use Docker-ready templates (102+) which have Docker pre-installed: + +| Template ID | Name | Docker? | +|-------------|------|---------| +| 100 | tmpl-ubuntu-2404-base | No | +| 101 | tmpl-ubuntu-2404-standard | No | +| 102 | tmpl-ubuntu-2404-docker | Yes | +| 103 | tmpl-ubuntu-2404-github-runner | Yes | +| 104 | tmpl-ubuntu-2404-pihole | Yes | + +**DO NOT** install Docker via cloud-init on templates 102+. + +## VM vs LXC for Docker + +| Factor | VM (QEMU) | LXC Unprivileged | LXC Privileged | +|--------|-----------|------------------|----------------| +| Docker support | Full | Limited | Works but risky | +| Isolation | Complete | Shared kernel | Shared kernel | +| Overhead | Higher | Lower | Lower | +| Nested containers | Works | Requires config | Works | +| GPU passthrough | Yes | Limited | Limited | +| Security | Best | Good | Avoid | + +**Recommendation:** Use VMs for Docker workloads. LXC adds complexity for marginal resource savings. 
+ +## VM Sizing for Docker + +### Minimum for Docker host + +``` +CPU: 2 cores +RAM: 4 GB (2 GB for OS, 2 GB for containers) +Disk: 50 GB (20 GB OS, 30 GB images/volumes) +``` + +### Per-container overhead + +``` +Base: ~10 MB RAM per container +Image layers: Shared between containers +Volumes: Depends on data +``` + +### Sizing formula + +``` +Total RAM = 2 GB (OS) + sum(container memory limits) + 20% buffer +Total Disk = 20 GB (OS) + images + volumes + 20% buffer +``` + +## Storage Backend Selection + +| Proxmox Storage | Docker Use Case | Performance | +|-----------------|-----------------|-------------| +| local-lvm | General workloads | Good | +| ZFS | Database containers | Better (snapshots) | +| Ceph | HA workloads | Good (distributed) | +| NFS | Shared config/data | Moderate | + +### Volume mapping to Proxmox storage + +```yaml +# docker-compose.yaml +volumes: + db_data: + driver: local + driver_opts: + type: none + device: /mnt/storage/mysql # Map to Proxmox storage mount + o: bind +``` + +## Network Considerations + +### Bridge mode (default) + +Container gets private IP, NAT to VM IP. Good for most workloads. + +```yaml +services: + web: + ports: + - "80:80" # VM_IP:80 -> container:80 +``` + +### Host mode + +Container shares VM network stack. Use for network tools or performance. + +```yaml +services: + pihole: + network_mode: host # Container uses VM's IPs directly +``` + +### Macvlan (direct LAN access) + +Container gets own IP on Proxmox bridge. + +```bash +# On Docker host (VM) +docker network create -d macvlan \ + --subnet=192.168.1.0/24 \ + --gateway=192.168.1.1 \ + -o parent=eth0 \ + lan +``` + +```yaml +services: + app: + networks: + lan: + ipv4_address: 192.168.1.50 + +networks: + lan: + external: true +``` + +**Note:** Requires Proxmox bridge without VLAN tagging on that interface, or pass-through the VLAN-tagged interface to VM. 
+ +## Resource Limits + +Always set limits to prevent container runaway affecting VM: + +```yaml +services: + app: + deploy: + resources: + limits: + cpus: '2' + memory: 2G + reservations: + cpus: '0.5' + memory: 512M +``` + +## GPU Passthrough + +For containers needing GPU (AI/ML, transcoding): + +1. **Proxmox:** Pass GPU to VM + ``` + hostpci0: 0000:01:00.0,pcie=1 + ``` + +2. **VM:** Install NVIDIA drivers + nvidia-container-toolkit + +3. **Compose:** + ```yaml + services: + plex: + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ``` + +## Backup Considerations + +### What to backup + +| Data | Method | Location | +|------|--------|----------| +| VM disk | Proxmox vzdump | Includes everything | +| Docker volumes | docker run --volumes-from | Application-level | +| Compose files | Git | Version control | + +### Proxmox backup includes Docker + +When backing up the VM with vzdump, all Docker data (images, volumes, containers) is included. + +```bash +vzdump --mode snapshot --storage backup +``` + +### Application-consistent backups + +For databases, use pre/post scripts: + +```bash +# Pre-backup: flush and lock +docker exec mysql mysql -e "FLUSH TABLES WITH READ LOCK;" + +# vzdump runs... + +# Post-backup: unlock +docker exec mysql mysql -e "UNLOCK TABLES;" +``` + +## Troubleshooting + +### Container can't reach internet + +1. Check VM can reach internet: `ping 8.8.8.8` +2. Check Docker DNS: `docker run --rm alpine nslookup google.com` +3. Check iptables forwarding: `sysctl net.ipv4.ip_forward` + +### Port not accessible from LAN + +1. Check Proxmox firewall allows port +2. Check VM firewall (ufw/iptables) +3. 
Check container is bound to 0.0.0.0 not 127.0.0.1 + +### Disk space issues + +```bash +# Check Docker disk usage +docker system df + +# Clean up +docker system prune -a --volumes # WARNING: removes all unused data + +# Check VM disk +df -h +``` diff --git a/skills/docker/references/proxmox/lxc-vs-docker.md b/skills/docker/references/proxmox/lxc-vs-docker.md new file mode 100644 index 0000000..ed76e16 --- /dev/null +++ b/skills/docker/references/proxmox/lxc-vs-docker.md @@ -0,0 +1,140 @@ +# LXC vs Docker Containers + +Understanding when to use Proxmox LXC containers vs Docker containers. + +## Fundamental Differences + +| Aspect | LXC (Proxmox) | Docker | +|--------|---------------|--------| +| Abstraction | System container (full OS) | Application container | +| Init system | systemd, runit, etc. | Single process (PID 1) | +| Management | Proxmox (pct) | Docker daemon | +| Persistence | Stateful by default | Ephemeral by default | +| Updates | apt/yum inside container | Replace container | +| Networking | Proxmox managed | Docker managed | + +## When to Use LXC + +- **Long-running services** with traditional management (systemd, cron) +- **Multi-process applications** that expect init system +- **Legacy apps** not designed for containers +- **Dev/test environments** mimicking full VMs +- **Resource efficiency** when full VM isolation not needed +- **Direct Proxmox management** (backup, snapshots, migration) + +```bash +# Create LXC +pct create 200 local:vztmpl/ubuntu-22.04-standard_22.04-1_amd64.tar.zst \ + --hostname mycontainer \ + --storage local-lvm \ + --rootfs local-lvm:8 \ + --cores 2 \ + --memory 2048 \ + --net0 name=eth0,bridge=vmbr0,ip=dhcp +``` + +## When to Use Docker + +- **Microservices** with single responsibility +- **CI/CD pipelines** with reproducible builds +- **Rapid deployment** and scaling +- **Application isolation** within a host +- **Compose stacks** with multi-container apps +- **Ecosystem tooling** (registries, orchestration) + +```yaml 
+# docker-compose.yaml +services: + app: + image: myapp:1.0 + restart: unless-stopped +``` + +## Decision Matrix + +| Scenario | Recommendation | Rationale | +|----------|---------------|-----------| +| Pi-hole | Docker on VM | Easy updates, compose ecosystem | +| Database server | LXC or VM | Stateful, traditional management | +| Web app microservice | Docker | Ephemeral, scalable | +| Development environment | LXC | Full OS, multiple services | +| CI runner | Docker on VM | Isolation, reproducibility | +| Network appliance | LXC | Direct network access, systemd | +| Home automation | Docker on VM | Compose stacks, easy backup | + +## Hybrid Approach + +Common pattern: **VM runs Docker**, managed by Proxmox. + +``` +Proxmox Node +├── VM: docker-host-1 (template 102) +│ ├── Container: nginx +│ ├── Container: app +│ └── Container: redis +├── VM: docker-host-2 (template 102) +│ ├── Container: postgres +│ └── Container: backup +└── LXC: pihole (direct network) +``` + +Benefits: +- Proxmox handles VM-level backup/migration +- Docker handles application deployment +- Clear separation of concerns + +## Docker in LXC (Not Recommended) + +Running Docker inside LXC is possible but adds complexity: + +### Requirements + +1. Privileged container OR nested containers enabled +2. AppArmor profile modifications +3. Keyctl feature enabled + +```bash +# LXC config (Proxmox) +lxc.apparmor.profile: unconfined +lxc.cgroup.devices.allow: a +lxc.cap.drop: +features: keyctl=1,nesting=1 +``` + +### Issues + +- Security: Reduced isolation +- Compatibility: Some Docker features broken +- Debugging: Two container layers +- Backup: More complex + +**Recommendation:** Use VM with Docker instead. 
+ +## Resource Comparison + +For equivalent workload: + +| Resource | VM + Docker | LXC | Docker in LXC | +|----------|-------------|-----|---------------| +| RAM overhead | ~500 MB | ~50 MB | ~100 MB | +| Disk overhead | ~5 GB | ~500 MB | ~1 GB | +| Boot time | 30-60s | 2-5s | 5-10s | +| Isolation | Full | Shared kernel | Shared kernel | +| Complexity | Low | Low | High | + +## Migration Paths + +### LXC to Docker + +1. Export application config from LXC +2. Create Dockerfile/compose +3. Build image +4. Deploy to Docker host +5. Migrate data volumes + +### Docker to LXC + +1. Install service directly in LXC (apt/yum) +2. Configure with systemd +3. Migrate data +4. Update Proxmox firewall rules diff --git a/skills/docker/references/troubleshooting.md b/skills/docker/references/troubleshooting.md new file mode 100644 index 0000000..e35c69f --- /dev/null +++ b/skills/docker/references/troubleshooting.md @@ -0,0 +1,212 @@ +# Docker Troubleshooting Reference + +## Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| Container exits immediately | Bad entrypoint, missing deps | Check logs, verify CMD | +| Port already in use | Conflict with host/other container | `lsof -i :`, change mapping | +| Volume permission denied | UID mismatch | Check ownership, use named volumes | +| Network not found | Network removed/not created | `docker network create` | +| Image pull failed | Registry/auth/name issue | Check registry, credentials, name | +| OOM killed | Exceeded memory limit | Increase limit or optimize app | +| DNS resolution failed | Network config issue | Check DNS settings, network mode | +| Health check failing | App not responding | Check command, increase timeout | + +## Diagnostic Commands + +### Container Status + +```bash +# List all containers (including stopped) +docker ps -a + +# Check exit code +docker inspect --format '{{.State.ExitCode}}' + +# Check restart count +docker inspect --format '{{.RestartCount}}' +``` + +### Logs + 
+```bash +# View logs +docker logs <container> + +# Follow logs +docker logs -f <container> + +# Last N lines +docker logs --tail 100 <container> + +# With timestamps +docker logs -t <container> + +# Since time +docker logs --since 10m <container> +``` + +### Resource Usage + +```bash +# Real-time stats +docker stats + +# Single container +docker stats <container> + +# Disk usage +docker system df +docker system df -v # Verbose +``` + +### Container Details + +```bash +# Full inspection +docker inspect <container> + +# Specific fields +docker inspect --format '{{.State.Status}}' <container> +docker inspect --format '{{json .NetworkSettings.Networks}}' <container> +docker inspect --format '{{.Mounts}}' <container> +``` + +### Process and Network + +```bash +# Running processes +docker top <container> + +# Execute command +docker exec <container> ps aux +docker exec <container> netstat -tlnp + +# Network connectivity +docker exec <container> ping <host> +docker exec <container> curl <url> +docker exec <container> nslookup <hostname> +``` + +## Troubleshooting Workflows + +### Container Won't Start + +1. Check logs: `docker logs <container>` +2. Check exit code: `docker inspect --format '{{.State.ExitCode}}' <container>` +3. Run interactively: `docker run -it <image> sh` +4. Check entrypoint/cmd: `docker inspect --format '{{.Config.Cmd}}' <container>` + +### Container Keeps Restarting + +1. Check logs for errors +2. Verify health check if configured +3. Check resource limits (OOM) +4. Test entrypoint manually + +### Network Issues + +1. Verify network exists: `docker network ls` +2. Check container attached: `docker inspect --format '{{.NetworkSettings.Networks}}' <container>` +3. Test DNS: `docker exec <container> nslookup <service>` +4. Check port mapping: `docker port <container>` + +### Volume Issues + +1. Check mount: `docker inspect --format '{{.Mounts}}' <container>` +2. Verify permissions inside: `docker exec <container> ls -la /path` +3. Check host path exists (bind mounts) +4. Try named volume instead + +### Performance Issues + +1. Check resource usage: `docker stats` +2. Review limits: `docker inspect --format '{{.HostConfig.Memory}}' <container>` +3. Check for resource contention +4.
Profile application inside container + +## Cleanup + +```bash +# Remove stopped containers +docker container prune + +# Remove unused images +docker image prune + +# Remove unused volumes +docker volume prune + +# Remove unused networks +docker network prune + +# Remove everything unused +docker system prune -a --volumes +``` + +## Debugging Compose + +```bash +# Validate compose file +docker compose config + +# See what would run +docker compose config --services + +# Check why service isn't starting +docker compose logs + +# Force recreate +docker compose up -d --force-recreate + +# Rebuild images +docker compose up -d --build +``` + +## Common Compose Issues + +| Problem | Check | +|---------|-------| +| Service not starting | `docker compose logs ` | +| depends_on not working | Service starts but app not ready (use healthcheck) | +| Volume not persisting | Check volume name, not recreated | +| Env vars not loading | Check .env file location, syntax | +| Network errors | Check network names, external networks | + +## Health Check Debugging + +```bash +# Check health status +docker inspect --format '{{.State.Health.Status}}' + +# View health log +docker inspect --format '{{json .State.Health}}' | jq + +# Test health command manually +docker exec +``` + +## Emergency Recovery + +### Force Stop + +```bash +docker kill +``` + +### Remove Stuck Container + +```bash +docker rm -f +``` + +### Reset Docker + +```bash +# Restart Docker daemon +sudo systemctl restart docker + +# Or on macOS +# Restart Docker Desktop +``` diff --git a/skills/docker/references/volumes.md b/skills/docker/references/volumes.md new file mode 100644 index 0000000..bb271e9 --- /dev/null +++ b/skills/docker/references/volumes.md @@ -0,0 +1,230 @@ +# Docker Volumes Reference + +## Volume Types + +### Named Volumes (Recommended) + +Managed by Docker, stored in `/var/lib/docker/volumes/`. 
+ +```yaml +volumes: + db-data: + +services: + db: + volumes: + - db-data:/var/lib/mysql +``` + +Benefits: +- Portable across hosts +- Backup-friendly +- No permission issues +- Can use volume drivers (NFS, etc.) + +### Bind Mounts + +Direct host path mapping. + +```yaml +services: + web: + volumes: + - ./config:/etc/app/config:ro + - /host/data:/container/data +``` + +Benefits: +- Direct file access from host +- Development workflow (live reload) +- Access to host files + +Drawbacks: +- Host-dependent paths +- Permission issues possible +- Less portable + +### tmpfs Mounts + +In-memory storage (Linux only). + +```yaml +services: + app: + tmpfs: + - /tmp + - /run:size=100m +``` + +Benefits: +- Fast (RAM-based) +- Secure (not persisted) +- Good for secrets, cache + +## Volume Options + +### Read-Only + +```yaml +volumes: + - ./config:/etc/app/config:ro +``` + +### Bind Propagation + +```yaml +volumes: + - type: bind + source: ./data + target: /data + bind: + propagation: rslave +``` + +### Volume Driver Options + +```yaml +volumes: + nfs-data: + driver: local + driver_opts: + type: nfs + o: addr=192.168.1.100,rw + device: ":/export/data" +``` + +## Common Patterns + +### Database Data + +```yaml +services: + postgres: + image: postgres:15 + volumes: + - pgdata:/var/lib/postgresql/data + environment: + POSTGRES_PASSWORD: secret + +volumes: + pgdata: +``` + +### Configuration Files + +```yaml +services: + nginx: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + - ./html:/usr/share/nginx/html:ro +``` + +### Shared Data Between Services + +```yaml +services: + app: + volumes: + - shared:/data + + worker: + volumes: + - shared:/data + +volumes: + shared: +``` + +### Log Persistence + +```yaml +services: + app: + volumes: + - logs:/var/log/app + +volumes: + logs: +``` + +## Backup and Restore + +### Backup Named Volume + +```bash +# Create backup +docker run --rm \ + -v myvolume:/source:ro \ + -v $(pwd):/backup \ + alpine tar czf 
/backup/myvolume.tar.gz -C /source . + +# Restore backup +docker run --rm \ + -v myvolume:/target \ + -v $(pwd):/backup \ + alpine tar xzf /backup/myvolume.tar.gz -C /target +``` + +### Copy Files from Volume + +```bash +docker cp :/path/to/file ./local-file +``` + +## Volume Management + +```bash +# List volumes +docker volume ls + +# Inspect volume +docker volume inspect + +# Remove unused volumes +docker volume prune + +# Remove specific volume +docker volume rm + +# Create volume manually +docker volume create --name myvolume +``` + +## Permissions + +### Common Permission Issues + +```bash +# Check container user +docker exec id + +# Check volume permissions +docker exec ls -la /data +``` + +### Solutions + +```yaml +# Run as specific user +services: + app: + user: "1000:1000" + volumes: + - ./data:/data +``` + +Or fix host permissions: +```bash +chown -R 1000:1000 ./data +``` + +## Best Practices + +1. **Use named volumes for data** - More portable than bind mounts +2. **Read-only when possible** - Use `:ro` for config files +3. **Separate concerns** - Different volumes for data, config, logs +4. **Backup strategy** - Plan for volume backup/restore +5. **Don't store in image** - Data should be in volumes, not image layers +6. **Use .dockerignore** - Exclude data directories from build context diff --git a/skills/proxmox/SKILL.md b/skills/proxmox/SKILL.md new file mode 100644 index 0000000..322a54f --- /dev/null +++ b/skills/proxmox/SKILL.md @@ -0,0 +1,95 @@ +--- +name: proxmox +description: | + Proxmox VE virtualization platform reference for VM/LXC management, clustering, + storage, and networking. Includes Terraform and Ansible integration patterns. + Use when working with Proxmox configurations, CLI commands, troubleshooting + VMs/containers, or planning resource allocation. + Triggers: proxmox, qemu, kvm, lxc, pve, vm, container, cluster, vzdump, qm, pct. 
+--- + +# Proxmox Skill + +Proxmox VE virtualization platform reference for VM management, containers, clustering, and homelab infrastructure. + +## Quick Reference + +```bash +# VM management (qm) +qm list # List all VMs +qm status <vmid> # Check VM status +qm start <vmid> # Start VM +qm stop <vmid> # Stop VM (forced) +qm shutdown <vmid> # Shutdown VM (ACPI, graceful) +qm unlock <vmid> # Remove lock +qm config <vmid> # Show VM config + +# Container management (pct) +pct list # List all containers +pct status <vmid> # Check container status +pct start <vmid> # Start container +pct stop <vmid> # Stop container +pct enter <vmid> # Enter container shell + +# Cluster management (pvecm) +pvecm status # Cluster status and quorum +pvecm nodes # List cluster nodes + +# API shell (pvesh) +pvesh get /nodes # List nodes via API +pvesh get /nodes/<node>/status # Node resource status + +# Backup (vzdump) +vzdump <vmid> --mode snapshot --storage <storage> +vzdump --all --compress zstd +``` + +## Reference Files + +Load on-demand based on task: + +| Topic | File | When to Load | +|-------|------|--------------| +| VM vs LXC | [vm-lxc.md](references/vm-lxc.md) | Choosing virtualization type | +| Docker Hosting | [docker-hosting.md](references/docker-hosting.md) | Running Docker on Proxmox | +| Networking | [networking.md](references/networking.md) | Bridges, VLANs, SDN, firewall | +| Storage | [storage.md](references/storage.md) | Storage backends, content types | +| Clustering | [clustering.md](references/clustering.md) | HA, quorum, fencing | +| Backup | [backup.md](references/backup.md) | vzdump modes, restore | +| CLI Tools | [cli-tools.md](references/cli-tools.md) | qm, pct, pvecm, pvesh commands | +| Troubleshooting | [troubleshooting.md](references/troubleshooting.md) | Common errors, diagnostics | +| Automation Tools | [automation-tools.md](references/automation-tools.md) | Terraform/Ansible integration | + +## Validation Checklist + +Before deploying VMs/containers: + +- [ ] Cluster status healthy (`pvecm status`) +- [ ] Node resources available (CPU, RAM, disk) +- [ ]
Storage accessible and mounted +- [ ] Network bridges configured correctly +- [ ] VLAN tags match network design +- [ ] Resource allocation within node limits +- [ ] HA configuration correct (if enabled) +- [ ] Backup schedule in place +- [ ] Naming convention followed + +## VM vs LXC Quick Decision + +| Factor | Use VM | Use LXC | +|--------|--------|---------| +| OS | Windows, BSD, any | Linux only | +| Isolation | Full kernel isolation | Shared kernel | +| Performance | Good | Better (lighter) | +| Startup | Slower | Fast | +| Density | Lower | Higher | +| Complexity | Any workload | Simple services | + +## Homelab Network VLANs + +| VLAN | Purpose | Proxmox Bridge | +|------|---------|----------------| +| 5 | Management (Web UI, API, SSH) | vmbr5 | +| 1 | Trusted network | vmbr0 | +| 11 | Storage (NFS/Ceph, MTU 9000) | vmbr11 | +| 12 | High-speed transfers | vmbr12 | diff --git a/skills/proxmox/references/automation-tools.md b/skills/proxmox/references/automation-tools.md new file mode 100644 index 0000000..ca895fe --- /dev/null +++ b/skills/proxmox/references/automation-tools.md @@ -0,0 +1,179 @@ +# Proxmox Automation Tools + +Integration patterns for managing Proxmox with Terraform and Ansible. 
+ +## Tool Selection Guide + +| Task | Recommended Tool | Rationale | +|------|-----------------|-----------| +| VM/LXC provisioning | Terraform | Declarative state, idempotent, handles dependencies | +| Template creation | Packer | Repeatable builds, version-controlled | +| Post-boot configuration | Ansible | Agent-based, procedural, good for drift | +| One-off VM operations | Ansible | Quick tasks, no state file needed | +| Dynamic inventory | Ansible | Query running VMs for configuration | +| Bulk VM creation | Terraform | count/for_each, parallel creation | +| Snapshot management | Either | Terraform for lifecycle, Ansible for ad-hoc | +| Cluster administration | CLI/API | Direct access for maintenance tasks | + +## Terraform Integration + +### Provider + +```hcl +terraform { + required_providers { + proxmox = { + source = "telmate/proxmox" + version = "~> 3.0" + } + } +} + +provider "proxmox" { + pm_api_url = "https://proxmox.example.com:8006/api2/json" + pm_api_token_id = "terraform@pve!mytoken" + pm_api_token_secret = var.pm_api_token_secret +} +``` + +### Common Patterns + +```hcl +# Clone from template +resource "proxmox_vm_qemu" "vm" { + name = "myvm" + target_node = "joseph" + clone = "tmpl-ubuntu-2404-standard" + full_clone = true + + cores = 2 + memory = 4096 + + disks { + scsi { + scsi0 { + disk { + storage = "local-lvm" + size = "50G" + } + } + } + } +} +``` + +### Skill Reference + +Load terraform skill for detailed patterns: +- `terraform/references/proxmox/gotchas.md` - Critical issues +- `terraform/references/proxmox/vm-qemu.md` - VM resource patterns +- `terraform/references/proxmox/authentication.md` - API setup + +## Ansible Integration + +### Collection + +```bash +ansible-galaxy collection install community.general +``` + +### Common Patterns + +```yaml +# Clone VM +- name: Clone from template + community.general.proxmox_kvm: + api_host: proxmox.example.com + api_user: ansible@pve + api_token_id: mytoken + api_token_secret: "{{ 
proxmox_token_secret }}" + node: joseph + vmid: 300 + name: myvm + clone: tmpl-ubuntu-2404-standard + full: true + timeout: 500 + +# Start VM +- name: Start VM + community.general.proxmox_kvm: + # ... auth ... + vmid: 300 + state: started +``` + +### Skill Reference + +Load ansible skill for detailed patterns: +- `ansible/references/proxmox/modules.md` - All Proxmox modules +- `ansible/references/proxmox/gotchas.md` - Common issues +- `ansible/references/proxmox/dynamic-inventory.md` - Auto-discovery + +## Terraform vs Ansible Decision + +### Use Terraform When + +- Creating infrastructure from scratch +- Managing VM lifecycle (create, update, destroy) +- Need state tracking and drift detection +- Deploying multiple similar VMs (for_each) +- Complex dependencies between resources +- Team collaboration with state locking + +### Use Ansible When + +- Configuring VMs after creation +- Ad-hoc operations (start/stop specific VMs) +- Dynamic inventory needed for other playbooks +- Quick one-off tasks +- No state file management desired +- Integration with existing Ansible workflows + +### Use Both When + +- Terraform provisions VMs +- Ansible configures them post-boot +- Ansible uses Proxmox dynamic inventory to find Terraform-created VMs + +## Hybrid Workflow Example + +``` +1. Packer builds VM template + └── packer build ubuntu-2404.pkr.hcl + +2. Terraform provisions VMs from template + └── terraform apply + └── Outputs: VM IPs, hostnames + +3. Ansible configures VMs + └── Uses Proxmox dynamic inventory OR + └── Uses Terraform output as inventory + +4. Ongoing management + └── Terraform for infrastructure changes + └── Ansible for configuration drift +``` + +## API Token Sharing + +Both tools can share the same API token: + +```bash +# Create shared token +pveum user add automation@pve +pveum aclmod / -user automation@pve -role PVEAdmin +pveum user token add automation@pve shared --privsep 0 +``` + +Store in shared secrets management (1Password, Vault, etc.). 
+ +## Common Gotchas + +| Issue | Terraform | Ansible | +|-------|-----------|---------| +| VMID | Auto-assigns if not specified | Must specify manually | +| Cloud-init changes | Use replace_triggered_by | Limited support, use API | +| State tracking | Yes (tfstate) | No state file | +| Parallel operations | Yes (configurable) | Yes (forks) | +| Template name vs ID | Supports both | Supports both | +| Timeout handling | Provider config | Module parameter | diff --git a/skills/proxmox/references/backup.md b/skills/proxmox/references/backup.md new file mode 100644 index 0000000..28544e4 --- /dev/null +++ b/skills/proxmox/references/backup.md @@ -0,0 +1,162 @@ +# Proxmox Backup Reference + +## vzdump Overview + +Built-in backup tool for VMs and containers. + +```bash +# Basic backup +vzdump + +# With options +vzdump --mode snapshot --storage backup-nfs --compress zstd + +# Backup all VMs +vzdump --all --compress zstd +``` + +## Backup Modes + +| Mode | Downtime | Method | Use Case | +|------|----------|--------|----------| +| stop | Full | Shutdown, backup, start | Consistent, any storage | +| suspend | Brief | Pause, backup, resume | Running state preserved | +| snapshot | None | LVM/ZFS/Ceph snapshot | Production, requires snapshot storage | + +### Mode Selection + +```bash +# Stop mode (most consistent) +vzdump --mode stop + +# Suspend mode (preserves RAM state) +vzdump --mode suspend + +# Snapshot mode (live, requires compatible storage) +vzdump --mode snapshot +``` + +## Backup Formats + +| Format | Type | Compression | +|--------|------|-------------| +| VMA | VMs | Native Proxmox format | +| tar | Containers | Standard tar archive | + +## Compression Options + +| Type | Speed | Ratio | CPU | +|------|-------|-------|-----| +| none | Fastest | 1:1 | Low | +| lzo | Fast | Good | Low | +| gzip | Moderate | Better | Medium | +| zstd | Fast | Best | Medium | + +Recommendation: `zstd` for best balance. 
+ +```bash +vzdump <vmid> --compress zstd +``` + +## Storage Configuration + +```bash +# Backup to specific storage +vzdump <vmid> --storage backup-nfs + +# Check available backup storage +pvesm status | grep backup +``` + +## Scheduled Backups + +Configure in Datacenter → Backup: + +- Schedule (cron format) +- Selection (all, pool, specific VMs) +- Storage destination +- Mode and compression +- Retention policy + +### Retention Policy + +``` +keep-last: 3 # Keep last N backups +keep-daily: 7 # Keep daily for N days +keep-weekly: 4 # Keep weekly for N weeks +keep-monthly: 6 # Keep monthly for N months +``` + +## Restore Operations + +### Full Restore + +```bash +# Restore VM +qmrestore <backup-file> <vmid> + +# Restore to different VMID +qmrestore <backup-file> <new-vmid> + +# Restore container +pct restore <vmid> <backup-file> +``` + +### Restore Options + +```bash +# Restore to different storage +qmrestore <backup-file> <vmid> --storage local-lvm + +# Force overwrite existing VM +qmrestore <backup-file> <vmid> --force +``` + +### File-Level Restore + +```bash +# Mount backup for file extraction +# (Use web UI: Backup → Restore → File Restore) +``` + +## Proxmox Backup Server (PBS) + +Dedicated backup server with deduplication.
+ +### Benefits + +- Deduplication across backups +- Encryption at rest +- Verification and integrity checks +- Efficient incremental backups +- Remote backup sync + +### Integration + +Add PBS storage: + +```bash +pvesm add pbs \ + --server \ + --datastore \ + --username @pbs \ + --fingerprint +``` + +## Backup Best Practices + +- Store backups on separate storage from VMs +- Use snapshot mode for production VMs +- Test restores regularly +- Offsite backup copy for disaster recovery +- Monitor backup job completion +- Set appropriate retention policy + +## Troubleshooting + +| Issue | Check | +|-------|-------| +| Backup fails | Storage space, VM state, permissions | +| Slow backup | Mode (snapshot faster), compression, network | +| Restore fails | Storage compatibility, VMID conflicts | +| Snapshot fails | Storage doesn't support snapshots | diff --git a/skills/proxmox/references/cli-tools.md b/skills/proxmox/references/cli-tools.md new file mode 100644 index 0000000..228d676 --- /dev/null +++ b/skills/proxmox/references/cli-tools.md @@ -0,0 +1,178 @@ +# Proxmox CLI Tools Reference + +## qm - VM Management + +```bash +# List and status +qm list # List all VMs +qm status # VM status +qm config # Show VM config + +# Power operations +qm start # Start VM +qm stop # Force stop +qm shutdown # ACPI shutdown +qm reboot # ACPI reboot +qm reset # Hard reset +qm suspend # Suspend to RAM +qm resume # Resume from suspend + +# Configuration +qm set --memory 4096 # Set memory +qm set --cores 4 # Set CPU cores +qm set --name newname # Rename VM + +# Disk operations +qm resize scsi0 +10G # Extend disk +qm move-disk scsi0 # Move disk + +# Snapshots +qm snapshot # Create snapshot +qm listsnapshot # List snapshots +qm rollback # Rollback +qm delsnapshot # Delete snapshot + +# Templates and clones +qm template # Convert to template +qm clone # Clone VM + +# Migration +qm migrate # Live migrate + +# Troubleshooting +qm unlock # Remove lock +qm showcmd # Show QEMU command +qm monitor 
# QEMU monitor +qm guest cmd # Guest agent command +``` + +## pct - Container Management + +```bash +# List and status +pct list # List all containers +pct status # Container status +pct config # Show config + +# Power operations +pct start # Start container +pct stop # Stop container +pct shutdown # Graceful shutdown +pct reboot # Reboot + +# Access +pct enter # Enter shell +pct exec -- # Run command +pct console # Attach console + +# Configuration +pct set --memory 2048 # Set memory +pct set --cores 2 # Set CPU cores +pct set --hostname name # Set hostname + +# Disk operations +pct resize rootfs +5G # Extend rootfs +pct move-volume # Move volume + +# Snapshots +pct snapshot # Create snapshot +pct listsnapshot # List snapshots +pct rollback # Rollback + +# Templates +pct template # Convert to template +pct clone # Clone container + +# Migration +pct migrate # Migrate container + +# Troubleshooting +pct unlock # Remove lock +pct push # Copy file to container +pct pull # Copy file from container +``` + +## pvecm - Cluster Management + +```bash +# Status +pvecm status # Cluster status +pvecm nodes # List nodes +pvecm qdevice # QDevice status + +# Node operations +pvecm add # Join cluster +pvecm delnode # Remove node +pvecm updatecerts # Update SSL certs + +# Recovery +pvecm expected # Set expected votes +``` + +## pvesh - API Shell + +```bash +# GET requests +pvesh get /nodes # List nodes +pvesh get /nodes//status # Node status +pvesh get /nodes//qemu # List VMs on node +pvesh get /nodes//qemu//status/current # VM status +pvesh get /storage # List storage +pvesh get /cluster/resources # All cluster resources + +# POST/PUT requests +pvesh create /nodes//qemu -vmid ... # Create VM +pvesh set /nodes//qemu//config ... 
# Modify VM + +# DELETE requests +pvesh delete /nodes//qemu/ # Delete VM +``` + +## vzdump - Backup + +```bash +# Basic backup +vzdump # Backup VM +vzdump # Backup container + +# Options +vzdump --mode snapshot # Snapshot mode +vzdump --compress zstd # With compression +vzdump --storage backup # To specific storage +vzdump --mailto admin@example.com # Email notification + +# Backup all +vzdump --all # All VMs and containers +vzdump --pool # All in pool +``` + +## qmrestore / pct restore + +```bash +# Restore VM +qmrestore +qmrestore --storage local-lvm + +# Restore container +pct restore +pct restore --storage local-lvm +``` + +## Useful Combinations + +```bash +# Check resources on all nodes +for node in joseph maxwell everette; do + echo "=== $node ===" + pvesh get /nodes/$node/status | jq '{cpu:.cpu, memory:.memory}' +done + +# Stop all VMs on a node +qm list | awk 'NR>1 {print $1}' | xargs -I {} qm stop {} + +# List VMs with their IPs (requires guest agent) +for vmid in $(qm list | awk 'NR>1 {print $1}'); do + echo -n "$vmid: " + qm guest cmd $vmid network-get-interfaces 2>/dev/null | jq -r '.[].["ip-addresses"][]?.["ip-address"]' | head -1 +done +``` diff --git a/skills/proxmox/references/clustering.md b/skills/proxmox/references/clustering.md new file mode 100644 index 0000000..94c37a0 --- /dev/null +++ b/skills/proxmox/references/clustering.md @@ -0,0 +1,181 @@ +# Proxmox Clustering Reference + +## Cluster Benefits + +- Centralized web management +- Live VM migration between nodes +- High availability (HA) with automatic failover +- Shared configuration + +## Cluster Requirements + +| Requirement | Details | +|-------------|---------| +| Version | Same major/minor Proxmox version | +| Time | NTP synchronized | +| Network | Low-latency cluster network | +| Names | Unique node hostnames | +| Storage | Shared storage for HA | + +## Cluster Commands + +```bash +# Check cluster status +pvecm status + +# List cluster nodes +pvecm nodes + +# Add node to cluster 
(run on new node) +pvecm add + +# Remove node (run on remaining node) +pvecm delnode + +# Expected votes (split-brain recovery) +pvecm expected +``` + +## Quorum + +Cluster requires majority of nodes online to operate. + +| Nodes | Quorum | Can Lose | +|-------|--------|----------| +| 2 | 2 | 0 (use QDevice) | +| 3 | 2 | 1 | +| 4 | 3 | 1 | +| 5 | 3 | 2 | + +### QDevice + +External quorum device for even-node clusters: + +- Prevents split-brain in 2-node clusters +- Runs on separate machine +- Provides tie-breaking vote + +## High Availability (HA) + +Automatic VM restart on healthy node if host fails. + +### Requirements + +- Shared storage (Ceph, NFS, iSCSI) +- Fencing enabled (watchdog) +- HA group configured +- VM added to HA + +### HA States + +| State | Description | +|-------|-------------| +| started | VM running, managed by HA | +| stopped | VM stopped intentionally | +| migrate | Migration in progress | +| relocate | Moving to different node | +| error | Problem detected | + +### HA Configuration + +1. Enable fencing (watchdog device) +2. Create HA group (optional) +3. Add VM to HA: Datacenter → HA → Add + +### Fencing + +Prevents split-brain by forcing failed node to stop: + +```bash +# Check watchdog status +cat /proc/sys/kernel/watchdog + +# Watchdog config +/etc/pve/ha/fence.cfg +``` + +## Live Migration + +Move running VM between nodes without downtime. 
+ +### Requirements + +- Shared storage OR local-to-local migration +- Same CPU architecture +- Network connectivity +- Sufficient resources on target + +### Migration Types + +| Type | Downtime | Requirements | +|------|----------|--------------| +| Live | Minimal | Shared storage | +| Offline | Full | Any storage | +| Local storage | Moderate | Copies disk | + +### Migration Command + +```bash +# Live migrate +qm migrate + +# Offline migrate +qm migrate --offline + +# With local disk +qm migrate --with-local-disks +``` + +## Cluster Network + +### Corosync Network + +Cluster communication (default port 5405): + +- Low-latency required +- Dedicated VLAN recommended +- Redundant links for HA + +### Configuration + +``` +# /etc/pve/corosync.conf +nodelist { + node { + name: node1 + ring0_addr: 192.168.10.1 + } + node { + name: node2 + ring0_addr: 192.168.10.2 + } +} +``` + +## Troubleshooting + +### Quorum Lost + +```bash +# Check status +pvecm status + +# Force expected votes (DANGEROUS) +pvecm expected 1 + +# Then: recover remaining nodes +``` + +### Node Won't Join + +- Check network connectivity +- Verify time sync +- Check Proxmox versions match +- Review /var/log/pve-cluster/ + +### Split Brain Recovery + +1. Identify authoritative node +2. Stop cluster services on other nodes +3. Set expected votes +4. Restart and rejoin nodes diff --git a/skills/proxmox/references/docker-hosting.md b/skills/proxmox/references/docker-hosting.md new file mode 100644 index 0000000..e2647cc --- /dev/null +++ b/skills/proxmox/references/docker-hosting.md @@ -0,0 +1,202 @@ +# Docker Workloads on Proxmox + +Best practices for hosting Docker containers on Proxmox VE. 
+ +## Hosting Options + +| Option | Isolation | Overhead | Complexity | Recommendation | +|--------|-----------|----------|------------|----------------| +| VM + Docker | Full | Higher | Low | **Recommended** | +| LXC + Docker | Shared kernel | Lower | High | Avoid | +| Bare metal Docker | None | Lowest | N/A | Not on Proxmox | + +## VM for Docker (Recommended) + +### Template Selection + +Use Docker-ready templates (102+): + +| Template | Docker Pre-installed | +|----------|---------------------| +| 102 (docker) | Yes | +| 103 (github-runner) | Yes | +| 104 (pihole) | Yes | + +### VM Sizing + +| Workload | CPU | RAM | Disk | +|----------|-----|-----|------| +| Light (1-3 containers) | 2 | 4 GB | 50 GB | +| Medium (4-10 containers) | 4 | 8 GB | 100 GB | +| Heavy (10+ containers) | 8+ | 16+ GB | 200+ GB | + +### Storage Backend + +| Proxmox Storage | Docker Suitability | Notes | +|-----------------|-------------------|-------| +| local-lvm | Good | Default, fast | +| ZFS | Best | Snapshots, compression | +| Ceph | Good | Distributed, HA | +| NFS | Moderate | Shared access, slower | + +### Network Configuration + +``` +Proxmox Node +├── vmbr0 (bridge) → VM eth0 → Docker bridge network +└── vmbr12 (high-speed) → VM eth1 → Docker macvlan (optional) +``` + +## Docker in LXC (Not Recommended) + +If you must run Docker in LXC: + +### Requirements + +1. **Privileged container** or nesting enabled +2. **AppArmor** profile unconfined +3. 
**Keyctl** feature enabled + +### LXC Options + +```bash +# Proxmox GUI: Options → Features +nesting: 1 +keyctl: 1 + +# Or in /etc/pve/lxc/.conf +features: keyctl=1,nesting=1 +lxc.apparmor.profile: unconfined +``` + +### Known Issues + +- Some Docker storage drivers don't work +- Overlay filesystem may have issues +- Reduced security isolation +- Complex debugging (two container layers) + +## Resource Allocation + +### CPU + +```bash +# VM config - dedicate cores to Docker host +cores: 4 +cpu: host # Pass through CPU features +``` + +### Memory + +```bash +# VM config - allow some overcommit for containers +memory: 8192 +balloon: 4096 # Minimum memory +``` + +### Disk I/O + +For I/O intensive containers (databases): + +```bash +# VM disk options +cache: none # Direct I/O for consistency +iothread: 1 # Dedicated I/O thread +ssd: 1 # If on SSD storage +``` + +## GPU Passthrough for Containers + +For transcoding (Plex) or ML workloads: + +### 1. Proxmox: Pass GPU to VM + +```bash +# /etc/pve/qemu-server/.conf +hostpci0: 0000:01:00.0,pcie=1 +``` + +### 2. VM: Install NVIDIA Container Toolkit + +```bash +# In VM +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt update && sudo apt install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +### 3. 
Docker Compose + +```yaml +services: + plex: + image: linuxserver/plex + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] +``` + +## Backup Strategy + +### VM-level (Recommended) + +Proxmox vzdump backs up entire Docker host including all containers: + +```bash +vzdump --mode snapshot --storage backup --compress zstd +``` + +### Application-level + +For consistent database backups, stop or flush before VM backup: + +```bash +# Pre-backup hook +docker exec postgres pg_dump -U user db > /backup/db.sql +``` + +## Monitoring + +### From Proxmox + +- VM CPU, memory, network, disk via Proxmox UI +- No visibility into individual containers + +### From Docker Host + +```bash +# Resource usage per container +docker stats + +# System-wide +docker system df +``` + +### Recommended Stack + +```yaml +# On Docker host +services: + prometheus: + image: prom/prometheus + cadvisor: + image: gcr.io/cadvisor/cadvisor + grafana: + image: grafana/grafana +``` + +## Skill References + +For Docker-specific patterns: +- `docker/references/compose.md` - Compose file structure +- `docker/references/networking.md` - Network modes +- `docker/references/volumes.md` - Data persistence +- `docker/references/proxmox/hosting.md` - Detailed hosting guide diff --git a/skills/proxmox/references/networking.md b/skills/proxmox/references/networking.md new file mode 100644 index 0000000..15cf870 --- /dev/null +++ b/skills/proxmox/references/networking.md @@ -0,0 +1,153 @@ +# Proxmox Networking Reference + +## Linux Bridges + +Default networking method for Proxmox VMs and containers. 
+ +### Bridge Configuration + +``` +# /etc/network/interfaces example +auto vmbr0 +iface vmbr0 inet static + address 192.168.1.10/24 + gateway 192.168.1.1 + bridge-ports eno1 + bridge-stp off + bridge-fd 0 + bridge-vlan-aware yes +``` + +### VLAN-Aware Bridge + +Enable VLAN tagging at VM level instead of separate bridges: + +- Set `bridge-vlan-aware yes` on bridge +- Configure VLAN tag in VM network config +- Simpler management, fewer bridges needed + +### Separate Bridges (Alternative) + +One bridge per VLAN: + +- vmbr0: Untagged/native VLAN +- vmbr1: VLAN 10 +- vmbr5: VLAN 5 + +More bridges but explicit network separation. + +## VLAN Configuration + +### At VM Level (VLAN-aware bridge) + +``` +net0: virtio=XX:XX:XX:XX:XX:XX,bridge=vmbr0,tag=20 +``` + +### At Bridge Level (Separate bridges) + +``` +net0: virtio=XX:XX:XX:XX:XX:XX,bridge=vmbr20 +``` + +## Firewall + +Three levels of firewall rules: + +| Level | Scope | Use Case | +|-------|-------|----------| +| Datacenter | Cluster-wide | Default policies | +| Node | Per-node | Node-specific rules | +| VM/Container | Per-VM | Application-specific | + +### Default Policy + +- Input: DROP (only allow explicit rules) +- Output: ACCEPT +- Enable firewall per VM in Options + +### Common Rules + +``` +# Allow SSH +IN ACCEPT -p tcp --dport 22 + +# Allow HTTP/HTTPS +IN ACCEPT -p tcp --dport 80 +IN ACCEPT -p tcp --dport 443 + +# Allow ICMP (ping) +IN ACCEPT -p icmp +``` + +## SDN (Software Defined Networking) + +Advanced networking for complex multi-tenant setups. + +### Zone Types + +| Type | Use Case | +|------|----------| +| Simple | Basic L2 network | +| VLAN | VLAN-based isolation | +| VXLAN | Overlay networking | +| EVPN | BGP-based routing | + +### When to Use SDN + +- Multi-tenant environments +- Complex routing requirements +- Cross-node L2 networks +- VXLAN overlay needs + +For homelab: Standard bridges usually sufficient. 
+ +## Network Performance + +### Jumbo Frames + +Enable on storage network for better throughput: + +``` +# Set MTU 9000 on bridge +auto vmbr40 +iface vmbr40 inet static + mtu 9000 + ... +``` + +Requires: All devices in path support jumbo frames. + +### VirtIO Multiqueue + +Enable parallel network processing for high-throughput VMs: + +``` +net0: virtio=XX:XX:XX:XX:XX:XX,bridge=vmbr0,queues=4 +``` + +## Troubleshooting + +### Check Bridge Status + +```bash +brctl show # List bridges and attached interfaces +ip link show vmbr0 # Bridge interface details +bridge vlan show # VLAN configuration +``` + +### Check VM Network + +```bash +qm config | grep net # VM network config +ip addr # From inside VM +``` + +### Common Issues + +| Problem | Check | +|---------|-------| +| No connectivity | Bridge exists, interface attached | +| Wrong VLAN | Tag matches switch config | +| Slow network | MTU mismatch, driver type | +| Firewall blocking | Rules, policy, enabled status | diff --git a/skills/proxmox/references/storage.md b/skills/proxmox/references/storage.md new file mode 100644 index 0000000..d29b622 --- /dev/null +++ b/skills/proxmox/references/storage.md @@ -0,0 +1,150 @@ +# Proxmox Storage Reference + +## Storage Types + +### Local Storage + +| Type | Features | Use Case | +|------|----------|----------| +| Directory | Simple, any filesystem | Basic storage | +| LVM | Block device, raw performance | Performance | +| LVM-thin | Thin provisioning, snapshots | Efficient space | +| ZFS | Compression, snapshots, high perf | Production | + +Limitations: No live migration, single node only. + +### Shared Storage + +| Type | Features | Use Case | +|------|----------|----------| +| NFS | File-based, simple | Shared access | +| Ceph RBD | Distributed block, HA | Production HA | +| iSCSI | Network block | SAN integration | +| GlusterFS | Distributed file | File sharing | + +Benefits: Live migration, HA, shared access. 
+ +## Content Types + +Configure what each storage can hold: + +| Content | Description | File Types | +|---------|-------------|------------| +| images | VM disk images | .raw, .qcow2 | +| iso | ISO images for install | .iso | +| vztmpl | Container templates | .tar.gz | +| backup | Backup files | .vma, .tar | +| rootdir | Container root FS | directories | +| snippets | Cloud-init, hooks | .yaml, scripts | + +## Storage Configuration + +### Add NFS Storage + +```bash +pvesm add nfs \ + --server \ + --export \ + --content images,iso,backup +``` + +### Add Ceph RBD + +```bash +pvesm add rbd \ + --monhost ,, \ + --pool \ + --content images,rootdir +``` + +### Check Storage Status + +```bash +pvesm status # All storage status +pvesh get /storage # API query +df -h # Disk space +``` + +## Disk Formats + +| Format | Features | Performance | +|--------|----------|-------------| +| raw | No overhead, full allocation | Fastest | +| qcow2 | Snapshots, thin provisioning | Moderate | + +Recommendation: Use `raw` for production, `qcow2` for dev/snapshots. 
+ +## Disk Cache Modes + +| Mode | Safety | Performance | Use Case | +|------|--------|-------------|----------| +| none | Safe | Good | Default, recommended | +| writeback | Unsafe | Best | Non-critical, battery backup | +| writethrough | Safe | Moderate | Compatibility | +| directsync | Safest | Slow | Critical data | + +## Storage Performance + +### Enable Discard (TRIM) + +For SSD thin provisioning: + +``` +scsi0: local-lvm:vm-100-disk-0,discard=on +``` + +### I/O Thread + +Dedicated I/O thread per disk: + +``` +scsi0: local-lvm:vm-100-disk-0,iothread=1 +``` + +### I/O Limits + +Throttle disk bandwidth: + +``` +# In VM config +bwlimit: +iops_rd: +iops_wr: +``` + +## Cloud-Init Storage + +Cloud-init configs stored in `snippets` content type: + +```bash +# Upload cloud-init files +scp user-data.yaml root@proxmox:/var/lib/vz/snippets/ + +# Or to named storage +scp user-data.yaml root@proxmox:/mnt/pve//snippets/ +``` + +Reference in VM: + +``` +cicustom: user=:snippets/user-data.yaml +``` + +## Backup Storage + +### Recommended Configuration + +- Separate storage for backups +- NFS or dedicated backup server +- Sufficient space for retention policy + +### Backup Retention + +Configure in Datacenter → Backup: + +``` +keep-last: 3 +keep-daily: 7 +keep-weekly: 4 +keep-monthly: 6 +``` diff --git a/skills/proxmox/references/troubleshooting.md b/skills/proxmox/references/troubleshooting.md new file mode 100644 index 0000000..da4ca0e --- /dev/null +++ b/skills/proxmox/references/troubleshooting.md @@ -0,0 +1,197 @@ +# Proxmox Troubleshooting Reference + +## Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| VM won't start | Lock, storage, resources | `qm unlock`, check storage, verify resources | +| Migration failed | No shared storage, resources | Verify shared storage, check target capacity | +| Cluster issues | Quorum, network, time | `pvecm status`, check NTP, network | +| Storage unavailable | Mount failed, network | Check mount, network 
access | +| High load | Resource contention | Identify bottleneck, rebalance VMs | +| Network issues | Bridge, VLAN, firewall | `brctl show`, check tags, firewall rules | +| Backup failed | Disk space, VM state | Check space, storage access | +| Template not found | Not downloaded | Download from Proxmox repo | +| API errors | Auth, permissions | Check token, user permissions | + +## Diagnostic Commands + +### Cluster Health + +```bash +pvecm status # Quorum and node status +pvecm nodes # List cluster members +systemctl status pve-cluster # Cluster service +systemctl status corosync # Corosync service +``` + +### Node Health + +```bash +pveversion -v # Proxmox version info +uptime # Load and uptime +free -h # Memory usage +df -h # Disk space +top -bn1 | head -20 # Process overview +``` + +### VM Diagnostics + +```bash +qm status # VM state +qm config # VM configuration +qm showcmd # QEMU command line +qm unlock # Clear locks +qm monitor # QEMU monitor access +``` + +### Container Diagnostics + +```bash +pct status # Container state +pct config # Container configuration +pct enter # Enter container shell +pct unlock # Clear locks +``` + +### Storage Diagnostics + +```bash +pvesm status # Storage status +df -h # Disk space +mount | grep -E 'nfs|ceph' # Mounted storage +zpool status # ZFS pool status (if using ZFS) +ceph -s # Ceph status (if using Ceph) +``` + +### Network Diagnostics + +```bash +brctl show # Bridge configuration +ip link # Network interfaces +ip addr # IP addresses +ip route # Routing table +bridge vlan show # VLAN configuration +``` + +### Log Files + +```bash +# Cluster logs +journalctl -u pve-cluster +journalctl -u corosync + +# VM/Container logs +journalctl | grep +tail -f /var/log/pve/tasks/* + +# Firewall logs +journalctl -u pve-firewall + +# Web interface logs +journalctl -u pveproxy +``` + +## Troubleshooting Workflows + +### VM Won't Start + +1. Check for locks: `qm unlock ` +2. Verify storage: `pvesm status` +3. 
Check resources: `free -h`, `df -h` +4. Review config: `qm config ` +5. Check logs: `journalctl | grep ` +6. Try manual start: `qm start --debug` + +### Migration Failure + +1. Verify shared storage: `pvesm status` +2. Check target resources: `pvesh get /nodes//status` +3. Verify network: `ping ` +4. Check version match: `pveversion` on both nodes +5. Review migration logs + +### Cluster Quorum Lost + +1. Check status: `pvecm status` +2. Identify online nodes +3. If majority lost, set expected: `pvecm expected ` +4. Recover remaining nodes +5. Rejoin lost nodes when available + +### Storage Mount Failed + +1. Check network: `ping ` +2. Verify mount: `mount | grep ` +3. Try manual mount +4. Check permissions on storage server +5. Review `/var/log/syslog` + +### High CPU/Memory Usage + +1. Identify culprit: `top`, `htop` +2. Check VM resources: `qm monitor ` → `info balloon` +3. Review resource allocation across cluster +4. Consider migration or resource limits + +## Recovery Procedures + +### Remove Failed Node + +```bash +# On healthy node +pvecm delnode + +# Clean up node-specific configs +rm -rf /etc/pve/nodes/ +``` + +### Force Stop Locked VM + +```bash +# Remove lock +qm unlock + +# If still stuck, find and kill QEMU process +ps aux | grep +kill + +# Force cleanup +qm stop --skiplock +``` + +### Recover from Corrupt Config + +```bash +# Backup current config +cp /etc/pve/qemu-server/.conf /root/.conf.bak + +# Edit config manually +nano /etc/pve/qemu-server/.conf + +# Or restore from backup +qmrestore +``` + +## Health Check Script + +```bash +#!/bin/bash +echo "=== Cluster Status ===" +pvecm status + +echo -e "\n=== Node Resources ===" +for node in $(pvecm nodes | awk 'NR>1 {print $3}'); do + echo "--- $node ---" + pvesh get /nodes/$node/status --output-format yaml | grep -E '^(cpu|memory):' +done + +echo -e "\n=== Storage Status ===" +pvesm status + +echo -e "\n=== Running VMs ===" +qm list | grep running + +echo -e "\n=== Running Containers ===" +pct list | 
grep running +``` diff --git a/skills/proxmox/references/vm-lxc.md b/skills/proxmox/references/vm-lxc.md new file mode 100644 index 0000000..fe70384 --- /dev/null +++ b/skills/proxmox/references/vm-lxc.md @@ -0,0 +1,103 @@ +# VM vs LXC Reference + +## Decision Matrix + +### Use VM (QEMU/KVM) When + +- Running Windows or non-Linux OS +- Need full kernel isolation +- Running untrusted workloads +- Complex hardware passthrough needed +- Different kernel version required +- GPU passthrough required + +### Use LXC When + +- Running Linux services +- Need lightweight, fast startup +- Comfortable with shared kernel +- Want better density/performance +- Simple application containers +- Development environments + +## QEMU/KVM VMs + +Full hardware virtualization with any OS support. + +### Hardware Configuration + +| Setting | Options | Recommendation | +|---------|---------|----------------| +| CPU type | host, kvm64, custom | `host` for performance | +| Boot | UEFI, BIOS | UEFI for modern OS | +| Display | VNC, SPICE, NoVNC | NoVNC for web access | + +### Storage Controllers + +| Type | Performance | Use Case | +|------|-------------|----------| +| VirtIO | Fastest | Linux, Windows with drivers | +| SCSI | Fast | General purpose | +| SATA | Moderate | Compatibility | +| IDE | Slow | Legacy OS | + +### Network Adapters + +| Type | Performance | Use Case | +|------|-------------|----------| +| VirtIO | Fastest | Linux, Windows with drivers | +| E1000 | Good | Compatibility | +| RTL8139 | Slow | Legacy OS | + +### Features + +- Snapshots (requires compatible storage) +- Templates for rapid cloning +- Live migration (requires shared storage) +- Hardware passthrough (GPU, USB, PCI) + +## LXC Containers + +OS-level virtualization with shared kernel. 
+ +### Container Types + +| Type | Security | Use Case | +|------|----------|----------| +| Unprivileged | Higher (recommended) | Production workloads | +| Privileged | Lower | Docker-in-LXC, NFS mounts | + +### Resource Controls + +- CPU cores and limits +- Memory hard/soft limits +- Disk I/O throttling +- Network bandwidth limits + +### Storage Options + +- Bind mounts from host +- Volume storage +- ZFS datasets + +### Features + +- Fast startup (seconds) +- Lower memory overhead +- Higher density per host +- Templates from Proxmox repo + +## Migration Considerations + +### VM Migration Requirements + +- Shared storage (Ceph, NFS, iSCSI) +- Same CPU architecture +- Compatible Proxmox versions +- Network connectivity between nodes + +### LXC Migration Requirements + +- Shared storage for live migration +- Same architecture +- Unprivileged preferred for portability diff --git a/skills/terraform/SKILL.md b/skills/terraform/SKILL.md new file mode 100644 index 0000000..b4e074b --- /dev/null +++ b/skills/terraform/SKILL.md @@ -0,0 +1,85 @@ +--- +name: terraform +description: | + Terraform infrastructure-as-code reference for HCL syntax, state management, + module design, and provider configuration. Use when working with Terraform + configurations (.tf files), running terraform commands, troubleshooting state + issues, or designing modules. Includes Telmate Proxmox provider patterns. + Triggers: terraform, tfstate, .tf files, HCL, modules, providers, proxmox_vm_qemu. +--- + +# Terraform Skill + +Infrastructure-as-code reference for Terraform configurations, state management, and provider patterns. 
+ +## Quick Reference + +```bash +# Core workflow +terraform init # Initialize, download providers +terraform validate # Syntax validation +terraform fmt -recursive # Format HCL files +terraform plan # Preview changes +terraform apply # Apply changes + +# Inspection +terraform state list # List resources in state +terraform state show # Show resource details +terraform graph | dot -Tsvg > graph.svg # Dependency graph + +# Debug +TF_LOG=DEBUG terraform plan 2>debug.log +``` + +## Core Workflow + +``` +init → validate → fmt → plan → apply +``` + +1. **init**: Download providers, initialize backend +2. **validate**: Check syntax and configuration validity +3. **fmt**: Ensure consistent formatting +4. **plan**: Preview what will change (review carefully) +5. **apply**: Execute changes + +## Reference Files + +Load on-demand based on task: + +| Topic | File | When to Load | +|-------|------|--------------| +| Proxmox Gotchas | [proxmox/gotchas.md](references/proxmox/gotchas.md) | Critical provider issues, workarounds | +| Proxmox Auth | [proxmox/authentication.md](references/proxmox/authentication.md) | Provider config, API tokens | +| Proxmox VMs | [proxmox/vm-qemu.md](references/proxmox/vm-qemu.md) | proxmox_vm_qemu resource patterns | +| Proxmox Errors | [proxmox/troubleshooting.md](references/proxmox/troubleshooting.md) | Common errors, debugging | +| State | [state-management.md](references/state-management.md) | Backends, locking, operations | +| Modules | [module-design.md](references/module-design.md) | Module patterns, composition | +| Security | [security.md](references/security.md) | Secrets, state security | +| External | [external-resources.md](references/external-resources.md) | Official docs, links | + +## Validation Checklist + +Before `terraform apply`: + +- [ ] `terraform init` completed successfully +- [ ] `terraform validate` passes +- [ ] `terraform fmt` applied +- [ ] `terraform plan` reviewed (check destroy/replace operations) +- [ ] Backend 
configured correctly (for team environments) +- [ ] State locking enabled (if remote backend) +- [ ] Sensitive variables marked `sensitive = true` +- [ ] Provider versions pinned in `terraform.tf` +- [ ] No secrets in version control +- [ ] Blast radius assessed (what could break?) + +## Variable Precedence + +(highest to lowest) + +1. `-var` flag: `terraform apply -var="name=value"` +2. `-var-file` flag: `terraform apply -var-file=prod.tfvars` +3. `*.auto.tfvars` files (alphabetically) +4. `terraform.tfvars` file +5. `TF_VAR_*` environment variables +6. Variable defaults in `variables.tf` diff --git a/skills/terraform/references/external-resources.md b/skills/terraform/references/external-resources.md new file mode 100644 index 0000000..d0fbda9 --- /dev/null +++ b/skills/terraform/references/external-resources.md @@ -0,0 +1,66 @@ +# External Resources + +Pointers to official documentation and community resources. + +## Official HashiCorp Documentation + +| Resource | URL | Use For | +|----------|-----|---------| +| Terraform Docs | https://developer.hashicorp.com/terraform/docs | Language reference, CLI commands | +| Terraform Tutorials | https://developer.hashicorp.com/terraform/tutorials | Step-by-step learning paths | +| Language Reference | https://developer.hashicorp.com/terraform/language | HCL syntax, expressions, functions | +| CLI Reference | https://developer.hashicorp.com/terraform/cli | Command options and usage | +| Best Practices | https://developer.hashicorp.com/terraform/cloud-docs/recommended-practices | Official workflow recommendations | + +## Terraform Registry + +| Resource | URL | Use For | +|----------|-----|---------| +| Provider Registry | https://registry.terraform.io/browse/providers | Find and explore providers | +| Module Registry | https://registry.terraform.io/browse/modules | Pre-built modules | +| Telmate Proxmox | https://registry.terraform.io/providers/Telmate/proxmox/latest/docs | Proxmox provider docs | +| AWS Provider | 
https://registry.terraform.io/providers/hashicorp/aws/latest/docs | AWS resource reference | + +## Proxmox Resources + +| Resource | URL | Use For | +|----------|-----|---------| +| Telmate Provider Docs | https://registry.terraform.io/providers/Telmate/proxmox/latest/docs | Resource configuration | +| Telmate GitHub | https://github.com/Telmate/terraform-provider-proxmox | Source, issues, examples | +| Proxmox VE API | https://pve.proxmox.com/pve-docs/api-viewer/ | Understanding API calls | +| Proxmox Wiki | https://pve.proxmox.com/wiki/Main_Page | Proxmox concepts and setup | + +## Community Resources + +| Resource | URL | Use For | +|----------|-----|---------| +| Terraform Best Practices | https://www.terraform-best-practices.com | Community-maintained guide | +| Awesome Terraform | https://github.com/shuaibiyy/awesome-terraform | Curated list of resources | +| Terraform Weekly | https://www.yourdevopsmentor.com/terraform-weekly | News and updates | + +## Learning Resources + +| Resource | URL | Use For | +|----------|-----|---------| +| HashiCorp Learn | https://developer.hashicorp.com/terraform/tutorials | Official tutorials | +| Terraform Up & Running | https://www.terraformupandrunning.com/ | Comprehensive book | + +## Tools + +| Tool | URL | Use For | +|------|-----|---------| +| TFLint | https://github.com/terraform-linters/tflint | Linting and best practices | +| Checkov | https://github.com/bridgecrewio/checkov | Security scanning | +| Infracost | https://github.com/infracost/infracost | Cost estimation | +| Terragrunt | https://terragrunt.gruntwork.io/ | DRY Terraform configurations | +| tfenv | https://github.com/tfutils/tfenv | Terraform version management | + +## Quick Links + +**Most commonly needed:** + +1. **HCL Syntax**: https://developer.hashicorp.com/terraform/language/syntax/configuration +2. **Functions**: https://developer.hashicorp.com/terraform/language/functions +3. 
**Expressions**: https://developer.hashicorp.com/terraform/language/expressions +4. **Backend Configuration**: https://developer.hashicorp.com/terraform/language/settings/backends +5. **Proxmox VM Resource**: https://registry.terraform.io/providers/Telmate/proxmox/latest/docs/resources/vm_qemu diff --git a/skills/terraform/references/module-design.md b/skills/terraform/references/module-design.md new file mode 100644 index 0000000..e2f7fc6 --- /dev/null +++ b/skills/terraform/references/module-design.md @@ -0,0 +1,165 @@ +# Module Design + +## Standard Structure + +``` +modules// +├── main.tf # Resources +├── variables.tf # Inputs +├── outputs.tf # Outputs +├── versions.tf # Provider constraints +``` + +## Module Example + +```hcl +# modules/vm/variables.tf +variable "name" { + description = "VM name" + type = string +} + +variable "target_node" { + description = "Proxmox node" + type = string +} + +variable "specs" { + type = object({ + cores = number + memory = number + disk = optional(string, "50G") + }) +} +``` + +```hcl +# modules/vm/main.tf +resource "proxmox_vm_qemu" "vm" { + name = var.name + target_node = var.target_node + cores = var.specs.cores + memory = var.specs.memory +} +``` + +```hcl +# modules/vm/outputs.tf +output "ip" { + value = proxmox_vm_qemu.vm.default_ipv4_address +} +``` + +```hcl +# Usage +module "web" { + source = "./modules/vm" + name = "web-01" + target_node = "pve1" + specs = { cores = 4, memory = 8192 } +} +``` + +## Complex Variable Types + +```hcl +# Map of objects +variable "vms" { + type = map(object({ + node = string + cores = number + memory = number + })) +} + +# Object with optional fields +variable "network" { + type = object({ + bridge = string + vlan = optional(number) + ip = optional(string, "dhcp") + }) +} +``` + +## Variable Validation + +```hcl +variable "environment" { + type = string + validation { + condition = contains(["dev", "staging", "prod"], var.environment) + error_message = "Must be dev, staging, or prod." 
+ } +} + +variable "cores" { + type = number + validation { + condition = var.cores >= 1 && var.cores <= 32 + error_message = "Cores must be 1-32." + } +} +``` + +## Module Composition + +```hcl +module "network" { + source = "../../modules/network" + # ... +} + +module "web" { + source = "../../modules/vm" + network_id = module.network.id # Implicit dependency +} + +module "database" { + source = "../../modules/vm" + depends_on = [module.network] # Explicit dependency +} +``` + +## for_each vs count + +```hcl +# count - index-based (0, 1, 2) +module "worker" { + source = "./modules/vm" + count = 3 + name = "worker-${count.index}" +} +# Access: module.worker[0] + +# for_each - key-based (preferred) +module "vm" { + source = "./modules/vm" + for_each = var.vms + name = each.key + specs = each.value +} +# Access: module.vm["web"] +``` + +## Version Constraints + +```hcl +# modules/vm/versions.tf +terraform { + required_version = ">= 1.0" + required_providers { + proxmox = { + source = "telmate/proxmox" + version = "~> 3.0" + } + } +} +``` + +```hcl +# Pin module version +module "vm" { + source = "git::https://github.com/org/modules.git//vm?ref=v2.1.0" +} +``` diff --git a/skills/terraform/references/proxmox/authentication.md b/skills/terraform/references/proxmox/authentication.md new file mode 100644 index 0000000..87603b7 --- /dev/null +++ b/skills/terraform/references/proxmox/authentication.md @@ -0,0 +1,44 @@ +# Proxmox Provider Authentication + +## Provider Configuration + +```hcl +terraform { + required_providers { + proxmox = { + source = "telmate/proxmox" + version = "~> 3.0" + } + } +} + +provider "proxmox" { + pm_api_url = "https://proxmox.example.com:8006/api2/json" + pm_api_token_id = "terraform@pve!mytoken" + pm_api_token_secret = var.pm_api_token_secret + pm_tls_insecure = false # true for self-signed certs + pm_parallel = 4 # concurrent operations + pm_timeout = 600 # API timeout seconds +} +``` + +## Create API Token + +```bash +pveum user add 
terraform@pve +pveum aclmod / -user terraform@pve -role PVEAdmin +# --privsep 0 lets the token inherit the user's permissions; by default tokens are +# privilege-separated and start with NO permissions of their own +pveum user token add terraform@pve mytoken --privsep 0 +``` + +## Environment Variables + +```bash +export PM_API_TOKEN_ID="terraform@pve!mytoken" +export PM_API_TOKEN_SECRET="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +``` + +## Official Resources + +- [Provider Docs](https://registry.terraform.io/providers/Telmate/proxmox/latest/docs) +- [GitHub](https://github.com/Telmate/terraform-provider-proxmox) +- [Proxmox API](https://pve.proxmox.com/pve-docs/api-viewer/) diff --git a/skills/terraform/references/proxmox/gotchas.md b/skills/terraform/references/proxmox/gotchas.md new file mode 100644 index 0000000..b9753a7 --- /dev/null +++ b/skills/terraform/references/proxmox/gotchas.md @@ -0,0 +1,86 @@ +# Proxmox Provider Gotchas + +Critical issues when using Telmate Proxmox provider with Terraform. + +## 1. Cloud-Init Changes Not Tracked + +Terraform does **not** detect changes to cloud-init snippet file contents. + +```hcl +# PROBLEM: Changing vendor-data.yml won't trigger replacement +resource "proxmox_vm_qemu" "vm" { + cicustom = "vendor=local:snippets/vendor-data.yml" +} + +# SOLUTION: Use replace_triggered_by +resource "local_file" "vendor_data" { + filename = "vendor-data.yml" + content = templatefile("vendor-data.yml.tftpl", { ... }) +} + +resource "proxmox_vm_qemu" "vm" { + cicustom = "vendor=local:snippets/vendor-data.yml" + + lifecycle { + replace_triggered_by = [ + local_file.vendor_data.content_base64sha256 + ] + } +} +``` + +## 2. Storage Type vs Storage Pool + +Different concepts - don't confuse: + +```hcl +disks { + scsi { + scsi0 { + disk { + storage = "local-lvm" # Pool NAME (from Proxmox datacenter) + size = "50G" + } + } + } +} +scsihw = "virtio-scsi-single" # Controller TYPE +``` + +- **Storage pool** = Where data stored (local-lvm, ceph-pool, nfs-share) +- **Disk type** = Interface (scsi, virtio, ide, sata) + +## 3.
Network Interface Naming + +Proxmox VMs get predictable names by device order: + +| NIC Order | Guest Name | +|-----------|------------| +| First | ens18 | +| Second | ens19 | +| Third | ens20 | + +**NOT** eth0, eth1. Configure cloud-init netplan matching `ens*`. + +## 4. API Token Expiration + +Long operations (20+ VMs) can exceed token lifetime. + +```hcl +provider "proxmox" { + pm_api_token_id = "terraform@pve!mytoken" + pm_api_token_secret = var.pm_api_token_secret + pm_timeout = 1200 # 20 minutes for large operations +} +``` + +Use API tokens (longer-lived) not passwords. + +## 5. Full Clone vs Linked Clone + +```hcl +full_clone = true # Independent copy - safe, slower, more storage +full_clone = false # References template - BREAKS if template modified +``` + +**Always use `full_clone = true` for production.** Linked clones only for disposable test VMs. diff --git a/skills/terraform/references/proxmox/troubleshooting.md b/skills/terraform/references/proxmox/troubleshooting.md new file mode 100644 index 0000000..3ce86ea --- /dev/null +++ b/skills/terraform/references/proxmox/troubleshooting.md @@ -0,0 +1,66 @@ +# Proxmox Troubleshooting + +## VM Creation Stuck + +``` +Timeout waiting for VM to be created +``` + +**Causes**: Template missing, storage full, network unreachable +**Debug**: Check Proxmox task log in web UI + +## Clone Failed + +``` +VM template not found +``` + +**Check**: `qm list | grep template-name` +**Causes**: Template doesn't exist, wrong node, permission issue + +## SSH Timeout + +``` +Timeout waiting for SSH +``` + +**Debug**: +1. VM console in Proxmox UI +2. `cloud-init status` on VM +3. 
`ip addr` to verify network + +**Causes**: Cloud-init failed, network misconfigured, firewall + +## State Drift + +``` +Plan shows changes for unchanged resources +``` + +**Causes**: Manual changes in Proxmox UI, provider bug +**Fix**: +```bash +terraform refresh +terraform plan # Verify +``` + +## API Errors + +``` +500 Internal Server Error +``` + +**Causes**: Invalid config, resource constraints, API timeout +**Debug**: Check `/var/log/pveproxy/access.log` on Proxmox node + +## Permission Denied + +``` +Permission check failed +``` + +**Fix**: Verify API token has required permissions: +```bash +pveum acl list +pveum user permissions terraform@pve +``` diff --git a/skills/terraform/references/proxmox/vm-qemu.md b/skills/terraform/references/proxmox/vm-qemu.md new file mode 100644 index 0000000..ae61f61 --- /dev/null +++ b/skills/terraform/references/proxmox/vm-qemu.md @@ -0,0 +1,86 @@ +# proxmox_vm_qemu Resource + +## Basic VM from Template + +```hcl +resource "proxmox_vm_qemu" "vm" { + name = "my-vm" + target_node = "pve1" + clone = "ubuntu-template" + full_clone = true + + cores = 4 + sockets = 1 + memory = 8192 + cpu = "host" + + onboot = true + agent = 1 # QEMU guest agent + + scsihw = "virtio-scsi-single" + disks { + scsi { + scsi0 { + disk { + storage = "local-lvm" + size = "50G" + } + } + } + } + + network { + bridge = "vmbr0" + model = "virtio" + } + + # Cloud-init + os_type = "cloud-init" + ciuser = "ubuntu" + sshkeys = var.ssh_public_key + ipconfig0 = "ip=dhcp" + # Static: ipconfig0 = "ip=192.168.1.10/24,gw=192.168.1.1" + + # Custom cloud-init + cicustom = "vendor=local:snippets/vendor-data.yml" +} +``` + +## Lifecycle Management + +```hcl +lifecycle { + prevent_destroy = true # Block accidental deletion + + ignore_changes = [ + network, # Ignore manual changes + ] + + replace_triggered_by = [ + local_file.cloud_init.content_base64sha256 + ] + + create_before_destroy = true # Blue-green deployment +} +``` + +## Multiple VMs with for_each + +```hcl 
+variable "vms" { + type = map(object({ + node = string + cores = number + memory = number + })) +} + +resource "proxmox_vm_qemu" "vm" { + for_each = var.vms + name = each.key + target_node = each.value.node + cores = each.value.cores + memory = each.value.memory + # ... +} +``` diff --git a/skills/terraform/references/security.md b/skills/terraform/references/security.md new file mode 100644 index 0000000..8d6b2c1 --- /dev/null +++ b/skills/terraform/references/security.md @@ -0,0 +1,92 @@ +# Security + +## Secrets Management + +### Environment Variables (Recommended) + +```bash +export TF_VAR_proxmox_password="secret" +export TF_VAR_api_token="xxxxx" +terraform apply +``` + +### Sensitive Variables + +```hcl +variable "database_password" { + type = string + sensitive = true # Hidden in logs/plan +} +``` + +### External Secrets Managers + +**HashiCorp Vault**: +```hcl +data "vault_generic_secret" "db" { + path = "secret/database" +} + +resource "some_resource" "x" { + password = data.vault_generic_secret.db.data["password"] +} +``` + +**1Password CLI**: +```bash +export TF_VAR_password="$(op read 'op://vault/item/password')" +terraform apply +``` + +## State Security + +**CRITICAL**: State contains secrets in plaintext. + +### Encrypt at Rest + +```hcl +backend "s3" { + encrypt = true + kms_key_id = "arn:aws:kms:..." 
# Optional KMS +} +``` + +### Restrict Access + +- IAM/RBAC on backend storage +- Enable state locking +- Never commit state to git + +## Provider Credentials + +```hcl +provider "proxmox" { + pm_api_token_id = "terraform@pve!mytoken" + pm_api_token_secret = var.pm_api_token_secret # From env +} +``` + +Create minimal-permission API user: +```bash +pveum user add terraform@pve +pveum aclmod / -user terraform@pve -role PVEVMAdmin +pveum user token add terraform@pve terraform-token +``` + +## Sensitive Outputs + +```hcl +output "db_password" { + value = random_password.db.result + sensitive = true +} +``` + +## Checklist + +- [ ] Sensitive vars marked `sensitive = true` +- [ ] Secrets via env vars or secrets manager +- [ ] State backend encryption enabled +- [ ] State locking enabled +- [ ] No credentials in .tf files +- [ ] Provider credentials minimal permissions diff --git a/skills/terraform/references/state-management.md b/skills/terraform/references/state-management.md new file mode 100644 index 0000000..3697344 --- /dev/null +++ b/skills/terraform/references/state-management.md @@ -0,0 +1,112 @@ +# State Management + +## Remote Backend (Recommended) + +```hcl +terraform { + backend "s3" { + bucket = "terraform-state" + key = "project/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "terraform-locks" # State locking + } +} +``` + +### S3-Compatible (MinIO, Ceph) + +```hcl +terraform { + backend "s3" { + bucket = "terraform-state" + key = "project/terraform.tfstate" + region = "us-east-1" # Required but ignored + + endpoint = "https://minio.example.com" + skip_credentials_validation = true + skip_metadata_api_check = true + skip_region_validation = true + force_path_style = true + } +} +``` + +## State Operations + +```bash +# List resources +terraform state list +terraform state list proxmox_vm_qemu.* + +# Show resource details +terraform state show proxmox_vm_qemu.web + +# Rename resource +terraform state mv proxmox_vm_qemu.old 
proxmox_vm_qemu.new + +# Move to module +terraform state mv proxmox_vm_qemu.web module.web.proxmox_vm_qemu.main + +# Remove from state (doesn't destroy) +terraform state rm proxmox_vm_qemu.orphaned + +# Import existing resource +terraform import proxmox_vm_qemu.web pve1/qemu/100 + +# Update state from infrastructure +terraform refresh +``` + +## State Migration + +```bash +# Change backend - updates terraform block, then: +terraform init -migrate-state + +# Reinitialize without migration +terraform init -reconfigure +``` + +## State Locking + +Prevents concurrent modifications. Enable via backend config: +- S3: `dynamodb_table` +- Consul: Built-in +- HTTP: `lock_address` + +### Force Unlock (Emergency) + +```bash +# Only when certain no operation running +terraform force-unlock LOCK_ID +``` + +## Troubleshooting + +### State Lock Timeout + +``` +Error: Error acquiring state lock +``` + +1. Wait for other operation +2. Verify no process running +3. `terraform force-unlock LOCK_ID` if safe + +### State Drift + +``` +Plan shows unexpected changes +``` + +```bash +terraform refresh # Update state from real infra +terraform plan # Review changes +``` + +### Corrupted State + +1. Restore from backup +2. `terraform state pull > backup.tfstate` +3. Last resort: `terraform state rm` and re-import