Initial commit

2025-11-29 18:00:24 +08:00
commit 4768fb755a
22 changed files with 11534 additions and 0 deletions
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,12 @@
+{
+  "name": "ansible-best-practices",
+  "description": "Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management",
+  "version": "1.0.0",
+  "author": {
+    "name": "basher83",
+    "email": "basher83@mail.spaceships.work"
+  },
+  "skills": [
+    "./skills"
+  ]
+}
--- a/README.md
+++ b/README.md
@@ -0,0 +1,3 @@
+# ansible-best-practices
+
+Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management
--- a/plugin.lock.json
+++ b/plugin.lock.json
@@ -0,0 +1,117 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:basher83/lunar-claude:plugins/infrastructure/ansible-best-practices",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "eef1ea0fdc4539368ef81ddc9ac68389c80a1e57",
+    "treeHash": "c9023a71527b9bb43fa99df4eb2c7dc8197daaaa62431b006bcd78599c5390e3",
+    "generatedAt": "2025-11-28T10:14:11.921713Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
+  },
+  "manifest": {
+    "name": "ansible-best-practices",
+    "description": "Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management",
+    "version": "1.0.0"
+  },
+  "content": {
+    "files": [
+      {
+        "path": "README.md",
+        "sha256": "e29716e1fad616884a71aebbba2c77c5948663e492bd1c6989993cc06e6f4d66"
+      },
+      {
+        "path": ".claude-plugin/plugin.json",
+        "sha256": "3c2b518746bbfbddb923eefef236873a6939cc148b0b41dba91e88a4603dd408"
+      },
+      {
+        "path": "skills/ansible-best-practices/SKILL.md",
+        "sha256": "c6c05c8d6e3cbad2f377424d7bb7704895f3742c5ae8c6d20d1d7aa20e96196b"
+      },
+      {
+        "path": "skills/ansible-best-practices/tools/lint-all.sh",
+        "sha256": "5efc687e1fdf9cf3ca461f559f083f009d4028ab6c4fb170ee3325238d285b74"
+      },
+      {
+        "path": "skills/ansible-best-practices/tools/check_idempotency.py",
+        "sha256": "727d4e35a560d50748f1fea99761a4aa14b9646cbdf978c7ec69ea8d0e73f5ce"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/role-structure-standards.md",
+        "sha256": "fa04e62bf3d59a2d883afaa19749850ef73abd524bad38f5193b281a382b0ffc"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/testing-comprehensive.md",
+        "sha256": "f98bf5b1d0ea916beb1ccf66d89504921f4ca2e9bcf7dda7ffaf90cd61fc0877"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/variable-management-patterns.md",
+        "sha256": "49becbed5312d7294321ce443729ccaf8d609f40b738b15dcc4a4271bb8327d0"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/documentation-templates.md",
+        "sha256": "1131d281cc706853ad06fa8d099dcac7e3658e30299d35019382d60e688b8bd0"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/network-automation.md",
+        "sha256": "17fcb8127b7bf96cf5fd3126492c1abf10258c674080acfb3c8af0c5f0565294"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/playbook-role-patterns.md",
+        "sha256": "0d3bca0260266215405c9e15a7876274b37b1b784a4c79c4c80c78f4215e0c08"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/cluster-automation.md",
+        "sha256": "a1f56c9d94370c70bf0ee0187f798f5bd1bdb15a3ff7a931a621a939b8313f9d"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/error-handling.md",
+        "sha256": "736c82e8410ac02ba18c104ef346b9c44e686d060414332db85ba75fe6e1c0d4"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/ceph-automation.md",
+        "sha256": "89a345ce583d56d0a9bfb54b707c8a074c0bf4dbc0951ecdda77af2f82d72024"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/meta-dependencies.md",
+        "sha256": "676ab77408753af4c477ffacceed202e00b4f8a3d360c68dc1b4a725096ccfc3"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/secrets-management.md",
+        "sha256": "484095a5c627fe89964edd3dddd28ef373be993a4276259ad5f2c1e212d05051"
+      },
+      {
+        "path": "skills/ansible-best-practices/patterns/handler-best-practices.md",
+        "sha256": "0c58980b793024c84dc1d1573524dd7d04beb97b6ae0127969709f5887317d11"
+      },
+      {
+        "path": "skills/ansible-best-practices/anti-patterns/common-mistakes.md",
+        "sha256": "07a257980ddd710c1670f4c286bf3fe6cf5ef95c12e603b2c3566364f144d64b"
+      },
+      {
+        "path": "skills/ansible-best-practices/examples/02-infisical-secrets/docker-deployment.yml",
+        "sha256": "56c24f19770ae371717f7fbfbc1b27ad325b871dc852061260d47c8a3a99964c"
+      },
+      {
+        "path": "skills/ansible-best-practices/examples/02-infisical-secrets/README.md",
+        "sha256": "c0554e6d3274543cf0b0d29ae4e99465d2f7a3b3dfab01ff9ac14291665823d1"
+      },
+      {
+        "path": "skills/ansible-best-practices/reference/production-repos.md",
+        "sha256": "d7c0eaa4cd41a77135f7c29291aa4b380c65af87d33f58a81f9192999de8353c"
+      }
+    ],
+    "dirSha256": "c9023a71527b9bb43fa99df4eb2c7dc8197daaaa62431b006bcd78599c5390e3"
+  },
+  "security": {
+    "scannedAt": null,
+    "scannerVersion": null,
+    "flags": []
+  }
+}
--- a/skills/ansible-best-practices/SKILL.md
+++ b/skills/ansible-best-practices/SKILL.md
@@ -0,0 +1,391 @@
+---
+name: ansible-best-practices
+description: >
+  Ansible playbook and role patterns using ansible.builtin modules, community.general,
+  community.proxmox, ansible.posix collections, molecule testing, ansible-lint validation,
+  and Infisical secrets management. Covers idempotency patterns (changed_when, failed_when,
+  register), YAML playbook structure, Jinja2 templating, handler patterns, and variable
+  precedence rules. This skill should be used when writing Ansible playbooks, developing
+  Ansible roles, testing with molecule/ansible-lint, managing secrets with Infisical,
+  implementing idempotent task patterns with changed_when/failed_when directives, or
+  configuring Proxmox/network automation.
+---
+
+# Ansible Playbook Best Practices
+
+Expert guidance for writing maintainable, idempotent, and testable Ansible playbooks based on
+real-world patterns from this repository.
+
+## Quick Reference
+
+### Pattern Decision Guide
+
+| Need | Use Pattern | Details |
+|------|-------------|---------|
+| **Use secrets?** | Infisical Secret Management | [patterns/secrets-management.md](patterns/secrets-management.md) |
+| **Resource management?** | State-Based Playbooks | [patterns/playbook-role-patterns.md](patterns/playbook-role-patterns.md) |
+| **No native module?** | Hybrid Module Approach | See Hybrid Module section below |
+| **Task failing?** | Proper Error Handling | [patterns/error-handling.md](patterns/error-handling.md) |
+| **Repeating blocks?** | Task Organization | [patterns/task-organization.md](patterns/task-organization.md) |
+| **Network config?** | Network Automation | [patterns/network-automation.md](patterns/network-automation.md) |
+| **Tasks show 'changed'?** | Idempotency Patterns | [reference/idempotency-patterns.md](reference/idempotency-patterns.md) |
+
+### Golden Rules
+
+1. **Use `uv run` prefix** - Always: `uv run ansible-playbook`
+2. **Fully qualify modules** - `ansible.builtin.copy` not `copy`
+3. **Secrets via Infisical** - Use reusable task pattern
+4. **Control `command`/`shell`** - Always use `changed_when`, `failed_when`
+5. **Use `set -euo pipefail`** - In all shell scripts
+6. **Protect sensitive tasks** - Use `no_log: true`
+7. **Idempotency first** - Check before create, verify after
+
+### Common Commands
+
+```bash
+# Lint
+mise run ansible-lint
+
+# Analyze complexity
+./tools/analyze_playbook.py ansible/playbooks/my-playbook.yml
+
+# Check idempotency
+./tools/check_idempotency.py ansible/playbooks/my-playbook.yml
+
+# Run with secrets
+cd ansible && uv run ansible-playbook playbooks/my-playbook.yml
+```
+
+## Core Patterns from This Repository
+
+### 1. Infisical Secret Management
+
+This repository uses **Infisical** for centralized secrets management.
+
+**Quick Pattern:**
+
+```yaml
+- name: Retrieve Proxmox credentials
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'PROXMOX_PASSWORD'
+    secret_var_name: 'proxmox_password'
+    fallback_env_var: 'PROXMOX_PASSWORD'  # Optional
+```
+
+**Key Features:** Validates authentication, proper `no_log`, fallback to env vars, reusable across playbooks.
+
+See [patterns/secrets-management.md](patterns/secrets-management.md) for complete guide including
+authentication methods, security best practices, and CI/CD integration.
+
+### 2. State-Based Playbooks
+
+**Pattern:** Single playbook handles both create and remove via `state` variable.
+
+```yaml
+# Create user (default)
+uv run ansible-playbook playbooks/create-admin-user.yml \
+  -e "admin_name=alice" -e "admin_ssh_key='ssh-ed25519 ...'"
+
+# Remove user (add state=absent)
+uv run ansible-playbook playbooks/create-admin-user.yml \
+  -e "admin_name=alice" -e "admin_state=absent"
+```
+
+**Why:** Follows community role patterns, single source of truth, consistent interface, less duplication.
+
+See [patterns/playbook-role-patterns.md](patterns/playbook-role-patterns.md) for complete implementation details and advanced patterns.
+
+### 3. Hybrid Module Approach
+
+**Pattern:** Use native modules where available, fall back to `command` when needed.
+
+```yaml
+# GOOD: Native module
+- name: Create Linux system user
+  ansible.builtin.user:
+    name: "{{ system_username }}"
+    state: present
+
+# ACCEPTABLE: Command when no native module exists
+- name: Create Proxmox API token
+  ansible.builtin.command: >
+    pveum user token add {{ system_username }}@{{ proxmox_user_realm }}
+  register: token_result
+  changed_when: "'already exists' not in token_result.stderr"
+  failed_when:
+    - token_result.rc != 0
+    - "'already exists' not in token_result.stderr"
+```
+
+**Key:** `changed_when` and `failed_when` make `command` module idempotent.
+
+### 4. Proper Error Handling
+
+```yaml
+- name: Check if resource exists
+  ansible.builtin.command: check-resource {{ resource_id }}
+  register: resource_check
+  changed_when: false  # Read-only operation
+  failed_when: false   # Don't fail, check in next task
+
+- name: Fail if resource missing
+  ansible.builtin.fail:
+    msg: "Resource {{ resource_id }} not found"
+  when: resource_check.rc != 0
+```
+
+See [patterns/error-handling.md](patterns/error-handling.md) for comprehensive patterns.
+
+### 5. Task Organization
+
+**Reusable Tasks Pattern:**
+
+```yaml
+# In playbook
+- name: Get database password
+  ansible.builtin.include_tasks: "{{ playbook_dir }}/../tasks/infisical-secret-lookup.yml"
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+```
+
+Extract common patterns to `tasks/` directory, use `include_tasks` with clear variable contracts.
+
+See [patterns/task-organization.md](patterns/task-organization.md) and [patterns/reusable-tasks.md](patterns/reusable-tasks.md).
+
+### 6. Network Automation
+
+**Pattern:** Use `community.general.interfaces_file` for network configuration.
+
+```yaml
+- name: Enable VLAN-aware bridging
+  community.general.interfaces_file:
+    iface: vmbr1
+    option: bridge-vlan-aware
+    value: "yes"
+    backup: true
+    state: present
+  notify: Reload network interfaces
+```
+
+Declarative config, automatic backup, handler pattern for reload.
+
+See [patterns/network-automation.md](patterns/network-automation.md) for advanced patterns including VLAN, bonding, and verification.
+
+### 7. Idempotency Patterns
+
+**Use `changed_when` and `failed_when`:**
+
+```yaml
+# Check before create
+- name: Check if VM exists
+  ansible.builtin.shell: |
+    set -o pipefail
+    qm list | awk '{print $1}' | grep -q "^{{ template_id }}$"
+  args:
+    executable: /bin/bash
+  register: vm_exists
+  changed_when: false  # Checking doesn't change anything
+  failed_when: false   # Don't fail if not found
+
+# Conditional create
+- name: Create VM
+  ansible.builtin.command: qm create {{ template_id }} ...
+  when: vm_exists.rc != 0
+```
+
+See [reference/idempotency-patterns.md](reference/idempotency-patterns.md) for comprehensive patterns.
+
+## Variable Organization
+
+### Quick Summary
+
+**Precedence:** Extra vars (`-e`) > Role vars > Defaults
+
+**Organization:**
+
+```text
+ansible/
+├── group_vars/all.yml      # Variables for ALL hosts
+├── group_vars/proxmox.yml  # Group-specific
+├── host_vars/foxtrot.yml   # Host-specific
+└── playbooks/
+    └── my-playbook.yml     # Use vars: for playbook-specific
+```
+
+**Key principle:** Use `defaults/main.yml` for configurable options, `vars/main.yml` for constants.
+
+See [reference/variable-precedence.md](reference/variable-precedence.md) for complete precedence
+rules (22 levels) and
+[patterns/variable-management-patterns.md](patterns/variable-management-patterns.md) for
+advanced patterns.
+
+## Module Selection
+
+### Prefer ansible.builtin
+
+**Always use fully qualified collection names (FQCN):**
+
+```yaml
+# GOOD
+- name: Ping hosts
+  ansible.builtin.ping:
+
+# BAD (ambiguous short name, flagged by ansible-lint)
+- name: Ping hosts
+  ping:
+```
+
+### Community Collections in Use
+
+- `community.general` - General utilities (interfaces_file, etc.)
+- `community.proxmox` - Proxmox VE management
+- `infisical.vault` - Secrets management
+- `ansible.posix` - POSIX system management
+- `community.docker` - Docker management
+
+See [../../ansible/requirements.yml](../../ansible/requirements.yml) and [reference/collections-guide.md](reference/collections-guide.md).
+
+## Testing
+
+### With ansible-lint
+
+```bash
+# Run all linters
+mise run lint-all
+
+# Just Ansible
+mise run ansible-lint
+```
+
+**Common Issues:** Missing `name:` on tasks, using `shell` instead of `command`, not using
+`changed_when`, deprecated short names, missing `no_log` on sensitive tasks.
+
+### With Molecule
+
+```bash
+cd tools/molecule/default
+molecule create    # Create test environment
+molecule converge  # Run playbook
+molecule verify    # Run tests
+molecule destroy   # Clean up
+```
+
+See [reference/testing-guide.md](reference/testing-guide.md) and [patterns/testing-comprehensive.md](patterns/testing-comprehensive.md) for CI/CD integration.
+
+## Common Anti-Patterns
+
+See [anti-patterns/common-mistakes.md](anti-patterns/common-mistakes.md) for detailed examples.
+
+### Quick List
+
+**1. Not Using `set -euo pipefail`**
+
+```yaml
+# GOOD
+- name: Run script
+  ansible.builtin.shell: |
+    set -euo pipefail
+    command1 | command2
+  args:
+    executable: /bin/bash
+```
+
+**2. Missing `no_log` on Secrets**
+
+```yaml
+# GOOD
+- name: Set password
+  ansible.builtin.command: set-password {{ password }}
+  no_log: true
+```
+
+**3. Using `shell` When `command` Suffices**
+
+Use `shell` ONLY when you need shell features (pipes, redirects, etc.).
+
+```yaml
+# GOOD: No shell features needed
+- name: List files
+  ansible.builtin.command: ls -la
+```
+
+See [anti-patterns/common-mistakes.md](anti-patterns/common-mistakes.md) for complete list and
+[anti-patterns/refactoring-guide.md](anti-patterns/refactoring-guide.md) for improvement
+strategies.
+
+## Tools Available
+
+### Python Analysis Tools (uv)
+
+```bash
+# Complexity metrics
+./tools/analyze_playbook.py playbook.yml
+
+# Find non-idempotent patterns
+./tools/check_idempotency.py playbook.yml
+
+# Variable organization helper
+./tools/extract_variables.py playbook.yml
+```
+
+### Linting
+
+```bash
+# Run all linters
+./tools/lint-all.sh
+```
+
+### Testing
+
+```bash
+# Molecule test scenarios
+./tools/molecule/default/
+```
+
+## Progressive Disclosure
+
+Start here, drill down as needed:
+
+### Quick Reference (Read First)
+
+- [Playbook & Role Patterns](patterns/playbook-role-patterns.md) - State-based playbooks, public API variables, validation
+- [Secrets Management](patterns/secrets-management.md) - Infisical integration, authentication, security
+
+### Deep Patterns (Read When Needed)
+
+- [Testing Comprehensive](patterns/testing-comprehensive.md) - Molecule, CI/CD, test strategies
+- [Role Structure Standards](patterns/role-structure-standards.md) - Directory org, naming conventions
+- [Documentation Templates](patterns/documentation-templates.md) - README structure, variable docs
+- [Variable Management Patterns](patterns/variable-management-patterns.md) - defaults vs vars, naming
+- [Handler Best Practices](patterns/handler-best-practices.md) - Handler usage patterns
+- [Meta Dependencies](patterns/meta-dependencies.md) - galaxy_info, dependencies
+
+### Advanced Automation (from ProxSpray Analysis)
+
+- [Cluster Automation](patterns/cluster-automation.md) - Proxmox cluster formation with idempotency
+- [Network Automation](patterns/network-automation.md) - Declarative network configuration
+- [CEPH Automation](patterns/ceph-automation.md) - Complete CEPH storage deployment
+
+### Core Reference
+
+- [Roles vs Playbooks](reference/roles-vs-playbooks.md) - Organization patterns
+- [Variable Precedence](reference/variable-precedence.md) - Complete precedence rules (22 levels)
+- [Idempotency Patterns](reference/idempotency-patterns.md) - Advanced idempotency techniques
+- [Module Selection](reference/module-selection.md) - Builtin vs community decision guide
+- [Testing Guide](reference/testing-guide.md) - Molecule and ansible-lint deep dive
+- [Collections Guide](reference/collections-guide.md) - Using and managing collections
+- [Production Repos](reference/production-repos.md) - Index of the geerlingguy roles studied
+
+### Patterns & Anti-Patterns
+
+- [Error Handling](patterns/error-handling.md) - Proper error handling patterns
+- [Task Organization](patterns/task-organization.md) - Reusable tasks and includes
+- [Common Mistakes](anti-patterns/common-mistakes.md) - What to avoid
+- [Refactoring Guide](anti-patterns/refactoring-guide.md) - How to improve existing playbooks
+
+## Related Skills
+
+- **Proxmox Infrastructure** - Playbooks for template creation and network config
+- **NetBox + PowerDNS** - Dynamic inventory and secrets management patterns
--- a/skills/ansible-best-practices/anti-patterns/common-mistakes.md
+++ b/skills/ansible-best-practices/anti-patterns/common-mistakes.md
@@ -0,0 +1,698 @@
+# Common Ansible Anti-Patterns and Mistakes
+
+## Overview
+
+This guide catalogs common mistakes found in Ansible playbooks and provides corrected examples based on Virgo-Core
+repository best practices.
+
+## 1. Not Using `set -euo pipefail` in Shell Scripts
+
+### ❌ Wrong
+
+```yaml
+- name: Run multi-line shell script
+  ansible.builtin.shell: |
+    command1
+    command2 | grep something
+    command3
+```
+
+**Problems:**
+
+- Failures earlier in a pipe are ignored (without `pipefail`, only the last command's exit status counts, so `command2` can fail unnoticed)
+- Undefined variables silently treated as empty strings
+- First command failure doesn't stop execution
+
+### ✅ Correct
+
+```yaml
+- name: Run multi-line shell script
+  ansible.builtin.shell: |
+    set -euo pipefail
+    command1
+    command2 | grep something
+    command3
+  args:
+    executable: /bin/bash
+```
+
+**Benefits:**
+
+- `-e`: Exit on first error
+- `-u`: Treat undefined variables as errors
+- `-o pipefail`: Pipe fails if any command in pipe fails
+- `executable: /bin/bash`: Ensures bash (not sh) interprets the script
+
+## 2. Using Shell When Command Suffices
+
+### ❌ Wrong
+
+```yaml
+- name: List files
+  ansible.builtin.shell: ls -la /tmp
+```
+
+**Problems:**
+
+- Unnecessary shell overhead
+- Shell injection risk if variables used
+- Less portable
+
+### ✅ Correct
+
+```yaml
+- name: List files
+  ansible.builtin.command: ls -la /tmp
+  changed_when: false
+```
+
+**Use `shell` ONLY when you need:**
+
+- Pipes: `cat file | grep pattern`
+- Redirects: `command > output.txt`
+- Environment expansion: `echo $HOME`
+- Shell built-ins: `source`, `cd`, etc.
+
+## 3. Missing `changed_when` on Command/Shell
+
+### ❌ Wrong
+
+```yaml
+- name: Check if VM exists
+  ansible.builtin.command: qm status 101
+```
+
+**Problem:** Reports "changed" even though it's a read-only check
+
+### ✅ Correct
+
+```yaml
+- name: Check if VM exists
+  ansible.builtin.command: qm status 101
+  register: vm_status
+  changed_when: false
+  failed_when: false
+```
+
+## 4. Missing `no_log` on Sensitive Tasks
+
+### ❌ Wrong
+
+```yaml
+- name: Create user with password
+  ansible.builtin.user:
+    name: myuser
+    password: "{{ user_password }}"
+  # Password will appear in logs!
+```
+
+**Problem:** Sensitive data appears in Ansible logs
+
+### ✅ Correct
+
+```yaml
+- name: Create user with password
+  ansible.builtin.user:
+    name: myuser
+    password: "{{ user_password }}"
+  no_log: true
+```
+
+**Always use `no_log: true` with:**
+
+- Passwords
+- API tokens
+- SSH keys
+- Certificates
+- Any PII or sensitive data
+
+## 5. Using Short Module Names
+
+### ❌ Wrong
+
+```yaml
+- name: Copy file
+  copy:
+    src: file.txt
+    dest: /tmp/file.txt
+
+- name: Install package
+  apt:
+    name: nginx
+    state: present
+```
+
+**Problem:** Short names are ambiguous (they can resolve to a module from a different collection) and are flagged by ansible-lint's `fqcn` rule
+
+### ✅ Correct
+
+```yaml
+- name: Copy file
+  ansible.builtin.copy:
+    src: file.txt
+    dest: /tmp/file.txt
+
+- name: Install package
+  ansible.builtin.apt:
+    name: nginx
+    state: present
+```
+
+**Use Fully Qualified Collection Names (FQCN):**
+
+- `ansible.builtin.copy` not `copy`
+- `ansible.builtin.command` not `command`
+- `community.proxmox.proxmox_kvm` not `proxmox_kvm`
+
+## 6. Hard-Coding Secrets
+
+### ❌ Wrong
+
+```yaml
+- name: Configure database
+  ansible.builtin.template:
+    src: db-config.j2
+    dest: /etc/app/db.yml
+  vars:
+    db_password: "MyPassword123"  # NEVER DO THIS!
+```
+
+**Problems:**
+
+- Secrets in version control
+- No audit trail
+- Difficult to rotate
+- Security violation
+
+### ✅ Correct
+
+```yaml
+- name: Retrieve database password
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+
+- name: Configure database
+  ansible.builtin.template:
+    src: db-config.j2
+    dest: /etc/app/db.yml
+  vars:
+    db_password: "{{ db_password }}"
+  no_log: true
+```
+
+## 7. Not Handling "Already Exists" Gracefully
+
+### ❌ Wrong
+
+```yaml
+- name: Create API token
+  ansible.builtin.command: pveum user token add terraform@pam terraform-token
+  # Fails if token already exists
+```
+
+**Problem:** Playbook not idempotent - fails on second run
+
+### ✅ Correct
+
+```yaml
+- name: Create API token
+  ansible.builtin.command: pveum user token add terraform@pam terraform-token
+  register: token_result
+  changed_when: "'already exists' not in token_result.stderr"
+  failed_when:
+    - token_result.rc != 0
+    - "'already exists' not in token_result.stderr"
+```
+
+**Pattern from repository:** Handle expected errors gracefully
+
+## 8. Missing Task Names
+
+### ❌ Wrong
+
+```yaml
+- ansible.builtin.apt:
+    name: nginx
+    state: present
+
+- ansible.builtin.systemd:
+    name: nginx
+    state: started
+```
+
+**Problem:** Hard to understand playbook output
+
+### ✅ Correct
+
+```yaml
+- name: Install Nginx web server
+  ansible.builtin.apt:
+    name: nginx
+    state: present
+
+- name: Start Nginx service
+  ansible.builtin.systemd:
+    name: nginx
+    state: started
+    enabled: true
+```
+
+**ansible-lint will flag this:** `name[missing]`
+
+## 9. Using `when` Instead of `failed_when`
+
+### ❌ Wrong
+
+```yaml
+- name: Run command
+  ansible.builtin.command: some-command
+  register: result
+  ignore_errors: true
+
+- name: Fail if bad
+  ansible.builtin.fail:
+    msg: "Command failed"
+  when: result.rc != 0 and 'acceptable error' not in result.stderr
+```
+
+**Problem:** Two tasks instead of one, less clear
+
+### ✅ Correct
+
+```yaml
+- name: Run command
+  ansible.builtin.command: some-command
+  register: result
+  failed_when:
+    - result.rc != 0
+    - "'acceptable error' not in result.stderr"
+```
+
+## 10. Ignoring Return Codes
+
+### ❌ Wrong
+
+```yaml
+- name: Run deployment script
+  ansible.builtin.command: /usr/local/bin/deploy.sh
+  # No error checking at all
+```
+
+**Problem:** Failures go unnoticed
+
+### ✅ Correct
+
+```yaml
+- name: Run deployment script
+  ansible.builtin.command: /usr/local/bin/deploy.sh
+  register: deploy_result
+
+- name: Verify deployment succeeded
+  ansible.builtin.assert:
+    that:
+      - deploy_result.rc == 0
+      - "'SUCCESS' in deploy_result.stdout"
+    fail_msg: "Deployment failed: {{ deploy_result.stderr }}"
+```
+
+## 11. Not Using Handlers for Service Restarts
+
+### ❌ Wrong
+
+```yaml
+- name: Update Nginx config
+  ansible.builtin.copy:
+    src: nginx.conf
+    dest: /etc/nginx/nginx.conf
+
+- name: Restart Nginx
+  ansible.builtin.systemd:
+    name: nginx
+    state: restarted
+  # Always restarts, even if config didn't change
+```
+
+**Problem:** Unnecessary service restarts
+
+### ✅ Correct
+
+```yaml
+- name: Update Nginx config
+  ansible.builtin.copy:
+    src: nginx.conf
+    dest: /etc/nginx/nginx.conf
+  notify: Restart Nginx
+
+handlers:
+  - name: Restart Nginx
+    ansible.builtin.systemd:
+      name: nginx
+      state: restarted
+```
+
+**Benefits:**
+
+- Only restarts if config changes
+- Multiple tasks can trigger same handler
+- Handler runs once at end
+
+## 12. Using `with_items` Instead of `loop`
+
+### ❌ Wrong (Legacy Syntax)
+
+```yaml
+- name: Install packages
+  ansible.builtin.apt:
+    name: "{{ item }}"
+    state: present
+  with_items:
+    - nginx
+    - docker.io
+    - python3-pip
+```
+
+**Problem:** `with_items` is legacy syntax; `loop` is the recommended replacement for simple loops
+
+### ✅ Correct
+
+```yaml
+- name: Install packages
+  ansible.builtin.apt:
+    name: "{{ item }}"
+    state: present
+  loop:
+    - nginx
+    - docker.io
+    - python3-pip
+```
+
+**Even better (single task):**
+
+```yaml
+- name: Install packages
+  ansible.builtin.apt:
+    name:
+      - nginx
+      - docker.io
+      - python3-pip
+    state: present
+```
+
+## 13. Not Validating Variables
+
+### ❌ Wrong
+
+```yaml
+- name: Create VM
+  community.proxmox.proxmox_kvm:
+    vmid: "{{ vm_id }}"
+    name: "{{ vm_name }}"
+    # ... config ...
+  # What if vm_id or vm_name is undefined?
+```
+
+**Problem:** Cryptic errors if variables missing
+
+### ✅ Correct
+
+```yaml
+- name: Validate VM variables
+  ansible.builtin.assert:
+    that:
+      - vm_id is defined
+      - vm_id is number
+      - vm_id >= 100
+      - vm_name is defined
+      - vm_name is match('^[a-z0-9-]+$')
+    fail_msg: |
+      Invalid VM configuration:
+      vm_id: {{ vm_id | default('UNDEFINED') }}
+      vm_name: {{ vm_name | default('UNDEFINED') }}
+
+- name: Create VM
+  community.proxmox.proxmox_kvm:
+    vmid: "{{ vm_id }}"
+    name: "{{ vm_name }}"
+    # ... config ...
+```
+
+## 14. Mixing Logic and Data
+
+### ❌ Wrong
+
+```yaml
+- name: Configure based on hostname
+  ansible.builtin.template:
+    src: app-config.j2
+    dest: /etc/app/config.yml
+  vars:
+    db_host: "{{ 'prod-db' if inventory_hostname == 'prod-server' else 'dev-db' }}"
+    # Logic in vars
+```
+
+**Problem:** Hard to maintain, not DRY
+
+### ✅ Correct
+
+**In `group_vars/prod.yml`:**
+
+```yaml
+db_host: prod-db
+```
+
+**In `group_vars/dev.yml`:**
+
+```yaml
+db_host: dev-db
+```
+
+**In playbook:**
+
+```yaml
+- name: Configure application
+  ansible.builtin.template:
+    src: app-config.j2
+    dest: /etc/app/config.yml
+```
+
+## 15. Not Using Tags
+
+### ❌ Wrong
+
+```yaml
+# No tags - must run entire playbook every time
+- name: Install packages
+  ansible.builtin.apt: ...
+
+- name: Configure service
+  ansible.builtin.template: ...
+
+- name: Start service
+  ansible.builtin.systemd: ...
+```
+
+### ✅ Correct
+
+```yaml
+- name: Install packages
+  ansible.builtin.apt: ...
+  tags: [install, packages]
+
+- name: Configure service
+  ansible.builtin.template: ...
+  tags: [config]
+
+- name: Start service
+  ansible.builtin.systemd: ...
+  tags: [service, start]
+```
+
+**Usage:**
+
+```bash
+# Only run config tasks
+ansible-playbook playbook.yml --tags config
+
+# Skip service start
+ansible-playbook playbook.yml --skip-tags start
+```
+
+## 16. Using Bare Variables in Templates
+
+### ❌ Wrong
+
+```jinja
+# templates/config.j2
+database_host: {{ db_host }}
+database_port: {{ db_port }}
+```
+
+**Problem:** YAML parsing errors if values contain special characters
+
+### ✅ Correct
+
+```jinja
+# templates/config.j2
+database_host: "{{ db_host }}"
+database_port: {{ db_port }}
+```
+
+**Rule:** Always quote strings, don't quote numbers/booleans
+
+## 17. Hardcoding Paths
+
+### ❌ Wrong
+
+```yaml
+- name: Copy script
+  ansible.builtin.copy:
+    src: scripts/deploy.sh
+    dest: /opt/myapp/deploy.sh
+  # Assumes specific directory structure
+```
+
+### ✅ Correct
+
+```yaml
+- name: Copy script
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/../scripts/deploy.sh"
+    dest: "{{ app_install_dir }}/deploy.sh"
+  vars:
+    app_install_dir: /opt/myapp
+```
+
+## 18. Not Using Blocks for Related Tasks
+
+### ❌ Wrong
+
+```yaml
+- name: Task 1
+  ansible.builtin.command: task1
+  when: deploy_mode == 'production'
+
+- name: Task 2
+  ansible.builtin.command: task2
+  when: deploy_mode == 'production'
+
+- name: Task 3
+  ansible.builtin.command: task3
+  when: deploy_mode == 'production'
+```
+
+**Problem:** Repetitive conditions
+
+### ✅ Correct
+
+```yaml
+- name: Production deployment tasks
+  block:
+    - name: Task 1
+      ansible.builtin.command: task1
+
+    - name: Task 2
+      ansible.builtin.command: task2
+
+    - name: Task 3
+      ansible.builtin.command: task3
+
+  when: deploy_mode == 'production'
+```
+
+## 19. Using `sudo` Instead of `become`
+
+### ❌ Wrong
+
+```yaml
+- name: Install package
+  ansible.builtin.command: sudo apt install nginx
+```
+
+**Problems:**
+
+- Bypasses Ansible's privilege escalation
+- No become_user support
+- Less portable
+
+### ✅ Correct
+
+```yaml
+- name: Install package
+  ansible.builtin.apt:
+    name: nginx
+    state: present
+  become: true
+```
+
+## 20. Not Testing Playbooks
+
+### ❌ Wrong
+
+```bash
+# Write playbook, run directly in production
+ansible-playbook production.yml
+```
+
+### ✅ Correct
+
+```bash
+# 1. Syntax check
+ansible-playbook playbook.yml --syntax-check
+
+# 2. Lint
+ansible-lint playbook.yml
+
+# 3. Dry run (check mode)
+ansible-playbook playbook.yml --check
+
+# 4. Test in development
+ansible-playbook playbook.yml -l dev
+
+# 5. Limited rollout in production
+ansible-playbook playbook.yml --limit 'prod[0]'
+
+# 6. Full production deployment
+ansible-playbook playbook.yml -l prod
+```
+
+## Quick Reference: Ansible-Lint Rules
+
+Common rules flagged by ansible-lint:
+
+| Rule ID | Description | Fix |
+|---------|-------------|-----|
+| `name[missing]` | Task missing name | Add `name:` field |
+| `fqcn[action-core]` | Use FQCN for modules | `ansible.builtin.copy` not `copy` |
+| `no-changed-when` | Command without `changed_when` | Add `changed_when:` |
+| `risky-shell-pipe` | Shell pipe without `set -o pipefail` | Add `set -euo pipefail` |
+| `no-log-password` | Password without `no_log` | Add `no_log: true` |
+
+**Run ansible-lint:**
+
+```bash
+cd ansible
+ansible-lint playbooks/my-playbook.yml
+```
+
+## Summary: Best Practices Checklist
+
+- [ ] Use `set -euo pipefail` in all shell scripts
+- [ ] Use `changed_when: false` for read-only commands
+- [ ] Add `no_log: true` to sensitive tasks
+- [ ] Use FQCN for all modules
+- [ ] Handle "already exists" errors gracefully
+- [ ] Add descriptive names to all tasks
+- [ ] Validate variables with `assert`
+- [ ] Use handlers for service restarts
+- [ ] Store secrets in Infisical, not playbooks
+- [ ] Test with ansible-lint before committing
+- [ ] Use blocks to group related tasks
+- [ ] Add tags for selective execution
+- [ ] Verify critical operations after execution
+
+## Further Reading
+
+- [Ansible Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html)
+- [Ansible-Lint Rules](https://ansible-lint.readthedocs.io/rules/)
--- a/skills/ansible-best-practices/examples/02-infisical-secrets/README.md
+++ b/skills/ansible-best-practices/examples/02-infisical-secrets/README.md
@@ -0,0 +1,475 @@
+# Docker Deployment with Infisical Secrets
+
+**Learning objective:** See best practices in action - secrets management, error handling, and idempotency.
+
+## What This Example Demonstrates
+
+This playbook showcases **production-ready Ansible patterns** from Virgo-Core:
+
+✅ **Secrets Management:**
+
+- Infisical integration using reusable task
+- Fallback to environment variables
+- `no_log: true` on sensitive tasks
+
+✅ **Error Handling:**
+
+- Pre-flight checks with `assert`
+- `changed_when` for idempotency
+- `failed_when` for graceful failures
+- Block/rescue for rollback
+
+✅ **Best Practices:**
+
+- Fully qualified module names (FQCN)
+- Task organization with blocks
+- Handlers for service restarts
+- Verification steps
+
+✅ **Docker Operations:**
+
+- Idempotent container management
+- Health checks with retries
+- Proper logging on failures
+
+## Prerequisites
+
+### 1. Infisical Setup
+
+**Universal Auth credentials:**
+
+```bash
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
+```
+
+**OR fallback environment variables:**
+
+```bash
+export DB_PASSWORD="fallback-db-password"
+export API_KEY="fallback-api-key"
+export REDIS_PASSWORD="fallback-redis-password"
+```
+
+### 2. Ansible Collections
+
+```bash
+# Install required collections
+cd ../../..  # Back to ansible directory
+uv run ansible-galaxy collection install -r requirements.yml
+```
+
+### 3. Target Hosts
+
+Update inventory with Docker hosts:
+
+```ini
+# inventory/hosts
+[docker_hosts]
+docker-01-nexus.spaceships.work
+```
+
+### 4. Templates (create these)
+
+The playbook references templates you need to create:
+
+**`templates/app-config.yml.j2`:**
+
+```yaml
+database:
+  host: db.spaceships.work
+  password: "{{ db_password }}"
+
+api:
+  key: "{{ api_key }}"
+
+redis:
+  host: redis.spaceships.work
+  password: "{{ redis_password }}"
+```
+
+**`templates/docker-compose.yml.j2`:**
+
+```yaml
+version: '3.8'
+services:
+  app:
+    image: your-app:latest
+    environment:
+      - CONFIG_FILE=/config/config.yml
+    volumes:
+      - {{ app_dir }}/config.yml:/config/config.yml:ro
+    ports:
+      - "8080:8080"
+```
+
+## Quick Start
+
+### 1. Validate Playbook
+
+**Syntax check:**
+
+```bash
+ansible-playbook docker-deployment.yml --syntax-check
+```
+
+**Lint check:**
+
+```bash
+ansible-lint docker-deployment.yml
+```
+
+**Dry run:**
+
+```bash
+ansible-playbook docker-deployment.yml --check
+```
+
+### 2. Run Playbook
+
+```bash
+# Full deployment
+ansible-playbook -i ../../inventory/hosts docker-deployment.yml
+
+# Specific tags
+ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags secrets
+ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags deploy
+ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags verify
+```
+
+### 3. Verify Deployment
+
+```bash
+# Check application health
+curl http://docker-01-nexus.spaceships.work:8080/health
+
+# Check Docker containers
+ssh ansible@docker-01-nexus.spaceships.work "docker ps"
+```
+
+## Understanding the Patterns
+
+### Pattern 1: Infisical Secret Lookup
+
+**The Pattern:**
+
+```yaml
+- name: Retrieve database password from Infisical
+  ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+    fallback_env_var: 'DB_PASSWORD'
+```
+
+**Why it works:**
+
+- Reusable task (DRY principle)
+- Validates authentication before retrieving
+- Fallback to environment for local dev
+- No secrets in logs
+- Clear error messages
+
+**Learn more:** [../../patterns/secrets-management.md](../../patterns/secrets-management.md)
+
+### Pattern 2: Pre-flight Validation
+
+**The Pattern:**
+
+```yaml
+pre_tasks:
+  - name: Validate required variables
+    ansible.builtin.assert:
+      that:
+        - app_name is defined
+      fail_msg: "Required variables not set"
+
+  - name: Check if Docker is installed
+    ansible.builtin.command: which docker
+    register: docker_check
+    changed_when: false  # Check doesn't change state
+    failed_when: false   # Don't fail yet
+```
+
+**Why it works:**
+
+- Fails fast with clear messages
+- Prevents partial deployments
+- Uses `changed_when: false` for checks
+- Uses `failed_when: false` to check result later
+
+### Pattern 3: Idempotent Docker Operations
+
+**The Pattern:**
+
+```yaml
+# Print only container names; without --format, docker ps also emits a header
+# row, so the comparison in the `when:` below would always be true.
+- name: Check if container is already running
+  ansible.builtin.command: docker ps --filter name={{ app_name }} --format "{{ '{{' }}.Names{{ '}}' }}"
+  register: container_check
+  changed_when: false
+  failed_when: false  # A failed probe is treated as "not running"
+
+- name: Start Docker containers
+  ansible.builtin.command: docker-compose up -d
+  register: compose_up
+  changed_when: "'Creating' in compose_up.stderr or 'Starting' in compose_up.stderr"
+  when: container_check.stdout != app_name
+```
+
+**Why it works:**
+
+- Check first, then create
+- Only reports "changed" if actually started something
+- Conditional execution with `when:`
+- True idempotency
+
+### Pattern 4: Block/Rescue Error Handling
+
+**The Pattern:**
+
+```yaml
+- name: Docker Management Block
+  block:
+    - name: Pull images
+      # ... tasks ...
+
+  rescue:
+    - name: Show container logs on failure
+      ansible.builtin.command: docker-compose logs --tail=50
+      register: container_logs
+
+    - name: Report failure
+      ansible.builtin.fail:
+        msg: "Deployment failed: {{ container_logs.stdout }}"
+```
+
+**Why it works:**
+
+- Groups related tasks
+- Rescue tasks run automatically when any task in the block fails
+- Provides debugging info
+- Clean error reporting
+
+**Learn more:** [../../patterns/error-handling.md](../../patterns/error-handling.md)
+
+### Pattern 5: Health Checks with Retries
+
+**The Pattern:**
+
+```yaml
+- name: Wait for application to be healthy
+  ansible.builtin.uri:
+    url: "http://localhost:8080/health"
+    status_code: 200
+  register: health_check
+  until: health_check.status == 200
+  retries: 30
+  delay: 10
+```
+
+**Why it works:**
+
+- Automatic retries for transient failures
+- Configurable timeout (30 × 10s = 5 minutes)
+- Fails clearly if never becomes healthy
+
+## Common Mistakes Avoided
+
+This playbook avoids common anti-patterns:
+
+### ❌ Anti-pattern 1: Hard-coded Secrets
+
+```yaml
+# DON'T DO THIS!
+- name: Deploy config
+  ansible.builtin.template:
+    src: config.j2
+    dest: /etc/app/config.yml
+  vars:
+    db_password: "MyPassword123"  # NEVER!
+```
+
+✅ **This playbook:** Uses Infisical with fallback to environment
+
+### ❌ Anti-pattern 2: Missing changed_when
+
+```yaml
+# DON'T DO THIS!
+- name: Start container
+  ansible.builtin.command: docker start myapp
+  # Always reports "changed" even if already running
+```
+
+✅ **This playbook:** Checks first, uses `changed_when` to detect actual changes
+
+### ❌ Anti-pattern 3: No Error Handling
+
+```yaml
+# DON'T DO THIS!
+- name: Deploy app
+  ansible.builtin.command: deploy.sh
+  # No check if it worked, no cleanup on failure
+```
+
+✅ **This playbook:** Uses block/rescue, verifies success
+
+### ❌ Anti-pattern 4: Secrets in Logs
+
+```yaml
+# DON'T DO THIS!
+- name: Set password
+  ansible.builtin.command: set-password {{ password }}
+  # Password visible in Ansible output!
+```
+
+✅ **This playbook:** Uses `no_log: true` on sensitive tasks
+
+## Customization
+
+### Different Application
+
+Change variables:
+
+```yaml
+vars:
+  app_name: "my-other-app"
+  app_dir: "/opt/my-other-app"
+```
+
+### Different Secrets
+
+Add more secret retrievals:
+
+```yaml
+- name: Retrieve JWT secret
+  ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'JWT_SECRET'
+    secret_var_name: 'jwt_secret'
+```
+
+### Skip Health Check
+
+```bash
+ansible-playbook docker-deployment.yml --skip-tags verify
+```
+
+## Troubleshooting
+
+### Infisical Authentication Failed
+
+**Error:** `Missing Infisical authentication credentials`
+
+**Solution:**
+
+```bash
+# Check environment variables
+echo $INFISICAL_UNIVERSAL_AUTH_CLIENT_ID
+echo $INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET
+
+# OR use fallback
+export DB_PASSWORD="fallback-password"
+```
+
+### Docker Not Installed
+
+**Error:** `Docker is not installed`
+
+**Solution:**
+
+```bash
+# Install Docker on target host
+ssh ansible@docker-host
+sudo apt update
+sudo apt install docker.io docker-compose
+```
+
+### Container Won't Start
+
+**Error:** `Docker deployment failed`
+
+**Solution:** Playbook shows logs automatically in rescue block. Review output for errors.
+
+**Manual check:**
+
+```bash
+ssh ansible@docker-host
+cd /opt/my-application
+docker-compose logs
+```
+
+### Health Check Timeout
+
+**Error:** `Wait for application to be healthy` times out
+
+**Solution:**
+
+```yaml
+# Increase retries/delay
+retries: 60  # 10 minutes
+delay: 10
+```
+
+## Testing the Playbook
+
+### Check Idempotency
+
+```bash
+# Run twice - second run should show no changes
+ansible-playbook docker-deployment.yml
+ansible-playbook docker-deployment.yml  # Should be all "ok", no "changed"
+```
+
+### Run Linters
+
+```bash
+# Ansible lint
+ansible-lint docker-deployment.yml
+
+# Custom idempotency check
+../../tools/check_idempotency.py docker-deployment.yml
+
+# Full lint suite
+../../tools/lint-all.sh
+```
+
+## Next Steps
+
+### Learn More Patterns
+
+- **Error Handling:** [../../patterns/error-handling.md](../../patterns/error-handling.md)
+- **Secrets Management:** [../../patterns/secrets-management.md](../../patterns/secrets-management.md)
+- **Common Mistakes:** [../../anti-patterns/common-mistakes.md](../../anti-patterns/common-mistakes.md)
+
+### Additional Examples
+
+- **Basic Playbook:** `../01-basic-playbook/` - Simpler starting point
+- **Repository Playbooks:** `../../../ansible/playbooks/` - Real production playbooks
+
+### Best Practices
+
+Review the main skill:
+
+- [../../SKILL.md](../../SKILL.md) - Complete best practices guide
+
+## Why These Patterns Matter
+
+**In Production:**
+
+- ✅ Secrets never in version control
+- ✅ Playbooks are truly idempotent
+- ✅ Clear error messages for troubleshooting
+- ✅ Audit trail for all operations
+- ✅ Rollback on failures
+
+**For Teams:**
+
+- ✅ Consistent patterns across playbooks
+- ✅ Easy to understand and maintain
+- ✅ Self-documenting code
+- ✅ Reduced bus factor
+
+**For You:**
+
+- ✅ Confidence in deployments
+- ✅ Less time debugging
+- ✅ Better sleep at night!
--- a/skills/ansible-best-practices/examples/02-infisical-secrets/docker-deployment.yml
+++ b/skills/ansible-best-practices/examples/02-infisical-secrets/docker-deployment.yml
@@ -0,0 +1,211 @@
+---
+# =============================================================================
+# Docker Deployment with Infisical Secrets
+# =============================================================================
+# This playbook demonstrates best practices from Virgo-Core:
+#   - Infisical secrets management (using reusable task)
+#   - Proper error handling with changed_when/failed_when
+#   - Idempotent command execution
+#   - No secrets in logs (no_log: true)
+#   - Fully qualified module names (FQCN)
+#   - Task organization with blocks
+
+# Single play: targets the docker_hosts inventory group, escalates privileges
+# for every task, and gathers facts before running.
+- name: Deploy Docker application with secrets from Infisical
+  hosts: docker_hosts
+  become: true
+  gather_facts: true
+
+  vars:
+    # Application name; also used as the docker ps name filter and compared
+    # against the running container's name further down.
+    app_name: "my-application"
+    # Directory that receives config.yml and docker-compose.yml.
+    app_dir: "/opt/{{ app_name }}"
+    # Infisical project / environment / secret path.
+    # NOTE(review): presumably read by the included infisical-secret-lookup.yml
+    # task, which is not part of this file — confirm.
+    infisical_project_id: "7b832220-24c0-45bc-a5f1-ce9794a31259"
+    infisical_env: "prod"
+    infisical_path: "/doggos-cluster"
+
+  # ==========================================================================
+  # Pre-flight Checks
+  # ==========================================================================
+
+  pre_tasks:
+    # Stop immediately when a variable the play depends on is missing or empty.
+    - name: Validate required variables
+      tags:
+        - always
+      ansible.builtin.assert:
+        success_msg: "All required variables present"
+        fail_msg: "Required variables not set"
+        that:
+          - app_name is defined and app_name | length > 0
+          - app_dir is defined
+          - infisical_project_id is defined
+
+    # Probe only: never reports a change and never fails here, so the next
+    # task can turn a non-zero return code into a readable error message.
+    - name: Check if Docker is installed
+      tags:
+        - always
+      ansible.builtin.command:
+        cmd: which docker
+      register: docker_check
+      failed_when: false
+      changed_when: false
+
+    - name: Fail if Docker not installed
+      tags:
+        - always
+      when: docker_check.rc != 0
+      ansible.builtin.fail:
+        msg: |
+          Docker is not installed on {{ inventory_hostname }}
+          Please install Docker first: sudo apt install docker.io
+
+  # ==========================================================================
+  # Main Tasks
+  # ==========================================================================
+
+  tasks:
+    # ========================================================================
+    # Retrieve Secrets from Infisical
+    # ========================================================================
+
+    # Each include runs the shared lookup task once per secret. The task is
+    # given the secret's name, the variable name to expose it under, and an
+    # environment variable to fall back to.
+    - name: Secrets Management Block
+      block:
+        - name: Retrieve database password from Infisical
+          ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
+          vars:
+            secret_name: 'DB_PASSWORD'
+            secret_var_name: 'db_password'
+            fallback_env_var: 'DB_PASSWORD'  # Optional fallback
+
+        - name: Retrieve API key from Infisical
+          ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
+          vars:
+            secret_name: 'API_KEY'
+            secret_var_name: 'api_key'
+            fallback_env_var: 'API_KEY'
+
+        - name: Retrieve Redis password from Infisical
+          ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
+          vars:
+            secret_name: 'REDIS_PASSWORD'
+            secret_var_name: 'redis_password'
+            fallback_env_var: 'REDIS_PASSWORD'
+
+      # `deploy` is listed as well: the configuration template rendered by the
+      # deployment block contains these secrets, so a run limited to
+      # `--tags deploy` must still retrieve them first.
+      tags: [secrets, config, deploy]
+
+    # ========================================================================
+    # Application Setup
+    # ========================================================================
+
+    # Lays down the application directory and both rendered files; any failure
+    # inside the block is converted into one deployment error by the rescue.
+    - name: Application Deployment Block
+      tags:
+        - deploy
+        - config
+      block:
+        - name: Create application directory
+          ansible.builtin.file:
+            state: directory
+            path: "{{ app_dir }}"
+            mode: '0755'
+            owner: root
+            group: root
+
+        # Owner-only permissions and no_log because the rendered file holds secrets.
+        - name: Deploy application configuration
+          no_log: true
+          ansible.builtin.template:
+            src: app-config.yml.j2
+            dest: "{{ app_dir }}/config.yml"
+            mode: '0600'
+            owner: root
+            group: root
+          notify: Restart application
+
+        - name: Deploy Docker Compose file
+          ansible.builtin.template:
+            src: docker-compose.yml.j2
+            dest: "{{ app_dir }}/docker-compose.yml"
+            mode: '0644'
+            owner: root
+            group: root
+
+      rescue:
+        - name: Report deployment failure
+          ansible.builtin.fail:
+            msg: "Failed to deploy application configuration"
+
+    # ========================================================================
+    # Docker Operations (with proper idempotency)
+    # ========================================================================
+
+    # Pulls images and starts the Compose project unless the probe reports a
+    # running container, then polls the health endpoint. On any failure the
+    # rescue section captures recent Compose logs and fails with them.
+    - name: Docker Management Block
+      block:
+        # Read-only probe: prints the names of running containers matching
+        # the filter; never reports a change and never fails the play.
+        - name: Check if container is already running
+          ansible.builtin.command: docker ps --filter name={{ app_name }} --format "{{ '{{' }}.Names{{ '}}' }}"
+          register: container_check
+          changed_when: false
+          failed_when: false
+
+        # NOTE(review): the `when:` guards below skip only when the probe
+        # output is exactly app_name. Confirm the Compose template names the
+        # container app_name; otherwise pull/up run on every play.
+        - name: Pull Docker images
+          ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml pull
+          args:
+            chdir: "{{ app_dir }}"
+          register: pull_result
+          changed_when: "'Downloaded newer image' in pull_result.stdout"
+          when: container_check.stdout != app_name
+
+        # Reports "changed" only when the command's stderr mentions creating
+        # or starting a container.
+        - name: Start Docker containers
+          ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml up -d
+          args:
+            chdir: "{{ app_dir }}"
+          register: compose_up
+          changed_when: "'Creating' in compose_up.stderr or 'Starting' in compose_up.stderr"
+          when: container_check.stdout != app_name
+
+        # Up to 30 attempts, 10 seconds apart, until the endpoint returns 200.
+        - name: Wait for application to be healthy
+          ansible.builtin.uri:
+            url: "http://localhost:8080/health"
+            status_code: 200
+          register: health_check
+          until: health_check.status == 200
+          retries: 30
+          delay: 10
+          changed_when: false
+
+      rescue:
+        # Collect the last 50 log lines so the failure message is actionable.
+        - name: Show container logs on failure
+          ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml logs --tail=50
+          args:
+            chdir: "{{ app_dir }}"
+          register: container_logs
+          changed_when: false
+
+        - name: Report Docker failure
+          ansible.builtin.fail:
+            msg: |
+              Docker deployment failed
+              Logs: {{ container_logs.stdout }}
+
+      tags: [deploy, docker]
+
+    # ========================================================================
+    # Verification
+    # ========================================================================
+
+    # Final check: the task fails unless docker ps reports a running container
+    # whose status text contains "Up".
+    - name: Verify application is running
+      tags:
+        - verify
+      ansible.builtin.command:
+        cmd: docker ps --filter name={{ app_name }} --filter status=running --format "{{ '{{' }}.Status{{ '}}' }}"
+      register: running_check
+      failed_when: "'Up' not in running_check.stdout"
+      changed_when: false
+
+    # Summary printed only when the verification above succeeded.
+    - name: Report deployment success
+      tags:
+        - verify
+      ansible.builtin.debug:
+        msg: |
+          ✓ Application deployed successfully
+          Container: {{ app_name }}
+          Status: {{ running_check.stdout }}
+          Health endpoint: http://{{ inventory_hostname }}:8080/health
+
+  # ==========================================================================
+  # Handlers
+  # ==========================================================================
+
+  handlers:
+    # Notified by the configuration template task; a restart always counts as
+    # a change.
+    - name: Restart application
+      ansible.builtin.command:
+        cmd: docker-compose -f {{ app_dir }}/docker-compose.yml restart
+        chdir: "{{ app_dir }}"
+      changed_when: true
--- a/skills/ansible-best-practices/patterns/ceph-automation.md
+++ b/skills/ansible-best-practices/patterns/ceph-automation.md
@@ -0,0 +1,687 @@
+# CEPH Storage Automation Patterns
+
+Best practices for automating CEPH cluster deployment in Proxmox VE environments.
+
+## Pattern: Declarative CEPH OSD Configuration
+
+**Problem**: ProxSpray leaves OSD creation as a manual step, defeating the purpose of automation.
+
+**Solution**: Fully automate OSD creation with declarative configuration that specifies devices and partitioning.
+
+### Configuration Model
+
+```yaml
+# group_vars/matrix_cluster.yml
+---
+# CEPH network configuration
+ceph_enabled: true
+ceph_network: "192.168.5.0/24"          # Public network (vmbr1)
+ceph_cluster_network: "192.168.7.0/24"  # Private network (vmbr2)
+
+# OSD configuration per node (4 OSDs per node = 12 total)
+ceph_osds:
+  foxtrot:
+    - device: /dev/nvme1n1
+      partitions: 2  # Create 2 OSDs per 4TB NVMe
+      db_device: null
+      wal_device: null
+      crush_device_class: nvme
+    - device: /dev/nvme2n1
+      partitions: 2
+      db_device: null
+      wal_device: null
+      crush_device_class: nvme
+
+  golf:
+    - device: /dev/nvme1n1
+      partitions: 2
+      crush_device_class: nvme
+    - device: /dev/nvme2n1
+      partitions: 2
+      crush_device_class: nvme
+
+  hotel:
+    - device: /dev/nvme1n1
+      partitions: 2
+      crush_device_class: nvme
+    - device: /dev/nvme2n1
+      partitions: 2
+      crush_device_class: nvme
+
+# Pool configuration
+ceph_pools:
+  - name: vm_ssd
+    pg_num: 128
+    pgp_num: 128
+    size: 3           # Replicate across 3 nodes
+    min_size: 2       # Minimum 2 replicas required
+    application: rbd
+    crush_rule: replicated_rule
+    compression: false
+
+  - name: vm_containers
+    pg_num: 64
+    pgp_num: 64
+    size: 3
+    min_size: 2
+    application: rbd
+    crush_rule: replicated_rule
+    compression: true
+```
+
+## Pattern: Idempotent CEPH Installation
+
+**Problem**: CEPH installation commands fail if already installed.
+
+**Solution**: Check CEPH status before attempting installation.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/install.yml
+---
+- name: Check if CEPH is already installed
+  ansible.builtin.stat:
+    path: /etc/pve/ceph.conf
+  register: ceph_conf_check
+
+- name: Check CEPH packages
+  ansible.builtin.command:
+    cmd: dpkg -l ceph-common
+  register: ceph_package_check
+  failed_when: false
+  changed_when: false
+
+- name: Install CEPH packages
+  ansible.builtin.command:
+    cmd: "pveceph install --repository no-subscription"
+  when:
+    - ceph_package_check.rc != 0
+  register: ceph_install
+  changed_when: "'installed' in ceph_install.stdout"
+
+- name: Verify CEPH installation
+  ansible.builtin.command:
+    cmd: ceph --version
+  register: ceph_version
+  changed_when: false
+  failed_when: ceph_version.rc != 0
+```
+
+## Pattern: CEPH Cluster Initialization
+
+**Problem**: A CEPH cluster can only be initialized once, so the initialization tasks must be idempotent.
+
+**Solution**: Check for existing cluster configuration before initialization.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/init.yml
+---
+- name: Check if CEPH cluster is initialized
+  ansible.builtin.command:
+    cmd: ceph status
+  register: ceph_status_check
+  failed_when: false
+  changed_when: false
+
+- name: Set CEPH initialization facts
+  ansible.builtin.set_fact:
+    ceph_initialized: "{{ ceph_status_check.rc == 0 }}"
+    is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group][0] }}"
+
+- name: Initialize CEPH cluster on first node
+  ansible.builtin.command:
+    cmd: "pveceph init --network {{ ceph_network }} --cluster-network {{ ceph_cluster_network }}"
+  when:
+    - is_ceph_first_node | default(false)
+    - not ceph_initialized
+  register: ceph_init
+  changed_when: ceph_init.rc == 0
+
+- name: Wait for CEPH cluster to initialize
+  ansible.builtin.pause:
+    seconds: 15
+  when: ceph_init.changed
+```
+
+## Pattern: CEPH Monitor Creation
+
+**Problem**: Monitors must be created in specific order and verified for quorum.
+
+**Solution**: Create monitors with proper ordering and quorum verification.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/monitors.yml
+---
+- name: Check existing CEPH monitors
+  ansible.builtin.command:
+    cmd: ceph mon dump
+  register: mon_dump
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  failed_when: false
+  changed_when: false
+
+- name: Set monitor facts
+  ansible.builtin.set_fact:
+    has_monitor: "{{ inventory_hostname in mon_dump.stdout }}"
+  when: mon_dump.rc == 0
+
+- name: Set local is_ceph_first_node fact
+  ansible.builtin.set_fact:
+    is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group][0] }}"
+
+- name: Create CEPH monitor on first node
+  ansible.builtin.command:
+    cmd: pveceph mon create
+  when:
+    - is_ceph_first_node | default(false)
+    - not has_monitor | default(false)
+  register: mon_create_first
+  changed_when: mon_create_first.rc == 0
+
+- name: Wait for first monitor to stabilize
+  ansible.builtin.pause:
+    seconds: 10
+  when: mon_create_first.changed
+
+- name: Create CEPH monitors on other nodes
+  ansible.builtin.command:
+    cmd: pveceph mon create
+  when:
+    - not (is_ceph_first_node | default(false))
+    - not has_monitor | default(false)
+  register: mon_create_others
+  changed_when: mon_create_others.rc == 0
+
+# Runs once on the first cluster node and fails when the command errors or
+# fewer monitors than expected are in quorum.
+- name: Verify monitor quorum
+  ansible.builtin.command:
+    cmd: ceph quorum_status
+  register: quorum_status
+  changed_when: false
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  vars:
+    expected_mons: "{{ ceph_mon_count | default(3) }}"
+  # Check rc first so from_json is never applied to empty output, and cast the
+  # expected count so a string value (e.g. from --extra-vars) compares as a number.
+  failed_when: >-
+    quorum_status.rc != 0 or
+    ((quorum_status.stdout | from_json).quorum | length) < (expected_mons | int)
+```
+
+## Pattern: CEPH Manager Creation
+
+**Problem**: Managers provide web interface and monitoring; should run on all nodes for HA.
+
+**Solution**: Create managers on all nodes with proper verification.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/managers.yml
+---
+- name: Check existing CEPH managers
+  ansible.builtin.command:
+    cmd: ceph mgr dump
+  register: mgr_dump
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  failed_when: false
+  changed_when: false
+
+- name: Set manager facts
+  ansible.builtin.set_fact:
+    has_manager: "{{ inventory_hostname in mgr_dump.stdout }}"
+  when: mgr_dump.rc == 0
+
+- name: Create CEPH manager
+  ansible.builtin.command:
+    cmd: pveceph mgr create
+  when: not has_manager | default(false)
+  register: mgr_create
+  changed_when: mgr_create.rc == 0
+
+- name: Enable CEPH dashboard module
+  ansible.builtin.command:
+    cmd: ceph mgr module enable dashboard
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  register: dashboard_enable
+  changed_when: "'already enabled' not in dashboard_enable.stderr"
+  failed_when:
+    - dashboard_enable.rc != 0
+    - "'already enabled' not in dashboard_enable.stderr"
+```
+
+## Pattern: Automated OSD Creation with Partitioning
+
+**Problem**: Manual OSD creation is error-prone and doesn't support partitioning large drives.
+
+**Solution**: Automate partition creation and OSD deployment.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/osd_create.yml
+---
+- name: Get list of existing OSDs
+  ansible.builtin.command:
+    cmd: pveceph osd ls
+  register: existing_osds
+  changed_when: false
+  failed_when: false
+
+- name: Probe existing CEPH volumes
+  ansible.builtin.command:
+    cmd: ceph-volume lvm list --format json
+  register: ceph_volume_probe
+  changed_when: false
+  failed_when: false
+
+- name: Check OSD devices availability
+  ansible.builtin.command:
+    cmd: "lsblk -ndo NAME,TYPE {{ item.device }}"
+  register: device_check
+  failed_when: device_check.rc != 0
+  changed_when: false
+  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
+  loop_control:
+    label: "{{ item.device }}"
+
+- name: Wipe existing partitions on OSD devices
+  ansible.builtin.command:
+    cmd: "wipefs -a {{ item.device }}"
+  when:
+    - ceph_volume_probe.rc == 0
+    - ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device) | list | length == 0
+    - ceph_wipe_disks | default(false)
+  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
+  loop_control:
+    label: "{{ item.device }}"
+  register: wipe_result
+  changed_when: wipe_result.rc == 0
+
+- name: Build list of partitions to create
+  ansible.builtin.set_fact:
+    osd_partitions: >-
+      {% set result = [] -%}
+      {% for osd in ceph_osds[inventory_hostname_short] | default([]) -%}
+        {% if (osd.partitions | default(1) | int) > 1 -%}
+          {% for part_num in range(1, (osd.partitions | int) + 1) -%}
+            {% set _ = result.append({
+              'device': osd.device,
+              'partition_num': part_num,
+              'total_partitions': osd.partitions,
+              'db_device': osd.get('db_device'),
+              'wal_device': osd.get('wal_device')
+            }) -%}
+          {% endfor -%}
+        {% endif -%}
+      {% endfor -%}
+      {{ result }}
+
+- name: Create partitions for multiple OSDs per device
+  community.general.parted:
+    device: "{{ item.device }}"
+    number: "{{ item.partition_num }}"
+    state: present
+    part_start: "{{ ((item.partition_num - 1) * (100 / item.total_partitions)) }}%"
+    part_end: "{{ (item.partition_num * (100 / item.total_partitions)) }}%"
+    label: gpt
+  loop: "{{ osd_partitions }}"
+  loop_control:
+    label: "{{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}"
+
+# Creates one OSD per configured device that is not split into partitions and
+# is not yet listed by ceph-volume. db_device / wal_device are optional keys:
+# default(false) keeps the template from failing on entries that omit them.
+- name: Create OSDs from whole devices
+  ansible.builtin.command:
+    cmd: >
+      pveceph osd create {{ item.device }}
+      {% if item.db_device | default(false) %}--db_dev {{ item.db_device }}{% endif %}
+      {% if item.wal_device | default(false) %}--wal_dev {{ item.wal_device }}{% endif %}
+  when:
+    # Cast like the partition-list task does, so a quoted "1" also matches.
+    - (item.partitions | default(1) | int) == 1
+    - ceph_volume_probe.rc == 0
+    - ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device + '$') | list | length == 0
+  loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
+  loop_control:
+    label: "{{ item.device }}"
+  register: osd_create_whole
+  changed_when: "'successfully created' in osd_create_whole.stdout"
+  # Both conditions must hold to fail: an "already in use" error is tolerated.
+  failed_when:
+    - osd_create_whole.rc != 0
+    - "'already in use' not in osd_create_whole.stderr"
+
+# Creates one OSD per entry in osd_partitions whose partition is not yet
+# listed by ceph-volume. NVMe partitions get the "p" infix
+# (e.g. /dev/nvme1n1p1); other devices append the number directly.
+- name: Create OSDs from partitions
+  ansible.builtin.command:
+    cmd: >
+      pveceph osd create {{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}
+      {% if item.db_device %}--db_dev {{ item.db_device }}{% endif %}
+      {% if item.wal_device %}--wal_dev {{ item.wal_device }}{% endif %}
+  when:
+    - ceph_volume_probe.rc == 0
+    - ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device + ('p' if item.device.startswith('/dev/nvme') else '') + (item.partition_num | string) + '$') | list | length == 0
+  loop: "{{ osd_partitions }}"
+  loop_control:
+    label: "{{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}"
+  register: osd_create_partition
+  changed_when: "'successfully created' in osd_create_partition.stdout"
+  # Both conditions must hold to fail: an "already in use" error is tolerated.
+  failed_when:
+    - osd_create_partition.rc != 0
+    - "'already in use' not in osd_create_partition.stderr"
+
+- name: Wait for OSDs to come up
+  ansible.builtin.command:
+    cmd: ceph osd tree
+  register: osd_tree
+  changed_when: false
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  until: "'up' in osd_tree.stdout"
+  retries: 10
+  delay: 5
+```
+
+## Pattern: CEPH Pool Creation
+
+**Problem**: Pools must be created with proper PG counts, replication, and application tags.
+
+**Solution**: Declarative pool configuration with validation.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/pools.yml
+---
+- name: Get existing CEPH pools
+  ansible.builtin.command:
+    cmd: ceph osd pool ls
+  register: existing_pools
+  changed_when: false
+
+- name: Create CEPH pools
+  ansible.builtin.command:
+    cmd: >
+      ceph osd pool create {{ item.name }}
+      {{ item.pg_num }}
+      {{ item.pgp_num | default(item.pg_num) }}
+      replicated
+      {{ item.crush_rule | default('replicated_rule') }}
+  when: item.name not in existing_pools.stdout_lines
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+  register: pool_create
+  changed_when: pool_create.rc == 0
+
+- name: Get current pool replication size
+  ansible.builtin.command:
+    cmd: "ceph osd pool get {{ item.name }} size -f json"
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+  register: pool_size_current
+  changed_when: false
+
+- name: Set pool replication size
+  ansible.builtin.command:
+    cmd: "ceph osd pool set {{ item.name }} size {{ item.size }}"
+  when: (pool_size_current.results[loop_index].stdout | from_json).size != item.size
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+    index_var: loop_index
+
+- name: Get current pool minimum replication size
+  ansible.builtin.command:
+    cmd: "ceph osd pool get {{ item.name }} min_size -f json"
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+  register: pool_min_size_current
+  changed_when: false
+
+- name: Set pool minimum replication size
+  ansible.builtin.command:
+    cmd: "ceph osd pool set {{ item.name }} min_size {{ item.min_size }}"
+  when: (pool_min_size_current.results[loop_index].stdout | from_json).min_size != item.min_size
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+    index_var: loop_index
+
+- name: Get current pool applications
+  ansible.builtin.command:
+    cmd: "ceph osd pool application get {{ item.name }} -f json"
+  when: item.application is defined
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+  register: pool_app_current
+  changed_when: false
+  failed_when: false
+
+- name: Set pool application
+  ansible.builtin.command:
+    cmd: "ceph osd pool application enable {{ item.name }} {{ item.application }}"
+  when:
+    - item.application is defined
+    - pool_app_current.results[loop_index].rc == 0
+    - item.application not in (pool_app_current.results[loop_index].stdout | from_json | default({}))
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+    index_var: loop_index
+
+- name: Get current pool compression mode
+  ansible.builtin.command:
+    cmd: "ceph osd pool get {{ item.name }} compression_mode -f json"
+  when: item.compression | default(false)
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+  register: pool_compression_current
+  changed_when: false
+
+- name: Enable compression on pools
+  ansible.builtin.command:
+    cmd: "ceph osd pool set {{ item.name }} compression_mode aggressive"
+  when:
+    - item.compression | default(false)
+    - (pool_compression_current.results[loop_index].stdout | from_json).compression_mode != 'aggressive'
+  loop: "{{ ceph_pools }}"
+  loop_control:
+    label: "{{ item.name }}"
+    index_var: loop_index
+```
+
+## Pattern: CEPH Health Verification
+
+**Problem**: CEPH cluster may appear successful but have health issues.
+
+**Solution**: Comprehensive health checks after deployment.
+
+### Implementation
+
+```yaml
+# roles/proxmox_ceph/tasks/verify.yml
+---
+- name: Check CEPH cluster health
+  ansible.builtin.command:
+    cmd: ceph health
+  register: ceph_health
+  changed_when: false
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+- name: Get CEPH status
+  ansible.builtin.command:
+    cmd: ceph status
+  register: ceph_status
+  changed_when: false
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+# Sum the OSDs expected across every host listed in ceph_osds. Each disk entry
+# contributes its `partitions` value, or 1 when the key is omitted.
+# The default is supplied through map(attribute=..., default=...) because the
+# plain `default` filter only replaces undefined items, and every disk entry
+# is itself a defined dict (requires Jinja2 >= 2.11).
+- name: Verify expected OSD count
+  ansible.builtin.set_fact:
+    expected_osd_count: >-
+      {{
+        ceph_osds
+        | dict2items
+        | map(attribute='value')
+        | sum(start=[])
+        | map(attribute='partitions', default=1)
+        | map('int')
+        | sum
+      }}
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+# `ceph status` prints human-readable text unless a format is requested, so
+# the OSD count is read from a dedicated JSON-formatted call rather than by
+# running from_json on the plain-text ceph_status output.
+- name: Get CEPH status as JSON
+  ansible.builtin.command:
+    cmd: ceph status --format json
+  register: ceph_status_json
+  changed_when: false
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+# NOTE(review): `osdmap.num_osds` is the path used by the original example;
+# confirm it matches the JSON layout of the Ceph release in use.
+- name: Check OSD count matches expected
+  ansible.builtin.assert:
+    that:
+      - "(ceph_status_json.stdout | from_json).osdmap.num_osds == (expected_osd_count | int)"
+    fail_msg: >-
+      Expected {{ expected_osd_count }} OSDs but found
+      {{ (ceph_status_json.stdout | from_json).osdmap.num_osds }}
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+- name: Check all OSDs are up
+  ansible.builtin.command:
+    cmd: ceph osd tree
+  register: osd_tree
+  changed_when: false
+  failed_when: "'down' in osd_tree.stdout"
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+
+- name: Verify PG status
+  ansible.builtin.command:
+    cmd: ceph pg stat
+  register: pg_stat
+  changed_when: false
+  failed_when: "'active+clean' not in pg_stat.stdout"
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+  retries: 30
+  delay: 10
+  until: "'active+clean' in pg_stat.stdout"
+
+- name: Display CEPH status
+  ansible.builtin.debug:
+    msg: |
+      CEPH Cluster Health: {{ ceph_health.stdout }}
+      {{ ceph_status.stdout_lines | join('\n') }}
+  delegate_to: "{{ groups[cluster_group][0] }}"
+  run_once: true
+```
+
+## Anti-Pattern: Manual OSD Creation
+
+**❌ Don't Do This** (from ProxSpray):
+
+```yaml
+- name: Create OSD on available disks (manual step required)
+  ansible.builtin.debug:
+    msg: |
+      To create OSDs, run manually:
+      pveceph osd create /dev/sda
+      pveceph osd create /dev/sdb
+```
+
+**Problems**:
+
+- Defeats purpose of automation
+- Error-prone manual process
+- No consistency across nodes
+- Difficult to scale
+
+**✅ Do This Instead**: Use the declarative OSD configuration pattern shown above.
+
+## Complete Role Example
+
+```yaml
+# roles/proxmox_ceph/tasks/main.yml
+---
+- name: Install CEPH packages
+  ansible.builtin.include_tasks: install.yml
+
+- name: Initialize CEPH cluster (first node only)
+  ansible.builtin.include_tasks: init.yml
+  when: inventory_hostname == groups[cluster_group][0]
+
+- name: Create CEPH monitors
+  ansible.builtin.include_tasks: monitors.yml
+
+- name: Create CEPH managers
+  ansible.builtin.include_tasks: managers.yml
+
+- name: Create OSDs
+  ansible.builtin.include_tasks: osd_create.yml
+  when: ceph_osds[inventory_hostname_short] is defined
+
+- name: Create CEPH pools
+  ansible.builtin.include_tasks: pools.yml
+  when: inventory_hostname == groups[cluster_group][0]
+
+- name: Verify CEPH health
+  ansible.builtin.include_tasks: verify.yml
+```
+
+## Testing
+
+```bash
+# Syntax check
+ansible-playbook --syntax-check playbooks/ceph-deploy.yml
+
+# Check mode (limited - CEPH commands don't support check mode well)
+ansible-playbook playbooks/ceph-deploy.yml --check --diff
+
+# Deploy CEPH to Matrix cluster
+ansible-playbook playbooks/ceph-deploy.yml --limit matrix_cluster
+
+# Verify CEPH status
+ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph status"
+ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph osd tree"
+ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph health detail"
+```
+
+## Matrix Cluster Example
+
+```yaml
+# playbooks/ceph-deploy.yml
+---
+- name: Deploy CEPH Storage on Matrix Cluster
+  hosts: matrix_cluster
+  become: true
+  serial: 1  # Deploy one node at a time
+
+  pre_tasks:
+    - name: Verify network MTU
+      ansible.builtin.command:
+        cmd: "ip link show vmbr1"
+      register: mtu_check
+      changed_when: false
+      failed_when: "'mtu 9000' not in mtu_check.stdout"
+
+  roles:
+    - role: proxmox_ceph
+      vars:
+        cluster_group: matrix_cluster
+        ceph_wipe_disks: false  # Set to true for fresh deployment
+```
+
+## Related Patterns
+
+- [Cluster Automation](cluster-automation.md) - Cluster formation prerequisite
+- [Network Automation](network-automation.md) - Network configuration for CEPH
+- [Error Handling](error-handling.md) - CEPH-specific error handling
+
+## References
+
+- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 333-488)
+- Proxmox VE CEPH documentation
+- CEPH configuration reference
+- OSD deployment best practices
--- a/skills/ansible-best-practices/patterns/cluster-automation.md
+++ b/skills/ansible-best-practices/patterns/cluster-automation.md
@@ -0,0 +1,335 @@
+# Cluster Automation Patterns
+
+Best practices for automating Proxmox cluster formation with idempotent,
+production-ready Ansible playbooks.
+
+## Pattern: Idempotent Cluster Status Detection
+
+**Problem**: Cluster formation commands (`pvecm create`, `pvecm add`) fail if run
+on nodes already in a cluster, making automation brittle.
+
+**Solution**: Always check cluster status before attempting destructive operations.
+
+### Implementation
+
+```yaml
+- name: Check existing cluster status
+  ansible.builtin.command:
+    cmd: pvecm status
+  register: cluster_status
+  failed_when: false
+  changed_when: false
+
+- name: Get cluster nodes list
+  ansible.builtin.command:
+    cmd: pvecm nodes
+  register: cluster_nodes_check
+  failed_when: false
+  changed_when: false
+
+- name: Set cluster facts
+  ansible.builtin.set_fact:
+    is_cluster_member: "{{ cluster_status.rc == 0 and (cluster_nodes_check.stdout_lines | length > 1 or cluster_name in cluster_status.stdout) }}"
+    is_first_node: "{{ inventory_hostname == groups['proxmox'][0] }}"
+    in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"
+
+- name: Create new cluster on first node
+  ansible.builtin.command:
+    cmd: "pvecm create {{ cluster_name }}"
+  when:
+    - is_first_node
+    - not in_target_cluster
+  register: cluster_create
+  changed_when: cluster_create.rc == 0
+
+- name: Join cluster on other nodes
+  ansible.builtin.command:
+    cmd: "pvecm add {{ hostvars[groups['proxmox'][0]].ansible_host }}"
+  when:
+    - not is_first_node
+    - not is_cluster_member
+  register: cluster_join
+  changed_when: cluster_join.rc == 0
+```
+
+### Key Benefits
+
+1. **Safe Re-runs**: Playbook can run multiple times without breaking existing clusters
+2. **Error Recovery**: Nodes can rejoin if removed from cluster
+3. **Multi-Cluster Support**: Prevents accidentally joining wrong cluster
+4. **Clear State**: `changed_when` accurately reflects actual changes
+
+## Pattern: Hostname Resolution Verification
+
+**Problem**: Cluster formation fails if nodes cannot resolve each other's
+hostnames, but errors are cryptic.
+
+**Solution**: Verify /etc/hosts configuration and DNS resolution before cluster operations.
+
+### Implementation
+
+```yaml
+- name: Ensure cluster nodes in /etc/hosts
+  ansible.builtin.lineinfile:
+    path: /etc/hosts
+    regexp: "^{{ item.ip }}\\s+"
+    line: "{{ item.ip }} {{ item.fqdn }} {{ item.short_name }}"
+    state: present
+  loop: "{{ cluster_nodes }}"
+  loop_control:
+    label: "{{ item.short_name }}"
+
+- name: Verify hostname resolution
+  ansible.builtin.command:
+    cmd: "getent hosts {{ item.fqdn }}"
+  register: host_lookup
+  failed_when: host_lookup.rc != 0
+  changed_when: false
+  loop: "{{ cluster_nodes }}"
+  loop_control:
+    label: "{{ item.fqdn }}"
+
+- name: Verify reverse DNS resolution
+  ansible.builtin.command:
+    cmd: "getent hosts {{ item.ip }}"
+  register: reverse_lookup
+  failed_when:
+    - reverse_lookup.rc != 0
+  changed_when: false
+  loop: "{{ cluster_nodes }}"
+  loop_control:
+    label: "{{ item.ip }}"
+```
+
+### Configuration Example
+
+```yaml
+# group_vars/matrix_cluster.yml
+cluster_name: "Matrix"
+cluster_nodes:
+  - short_name: foxtrot
+    fqdn: foxtrot.matrix.spaceships.work
+    ip: 192.168.3.5
+    corosync_ip: 192.168.8.5
+  - short_name: golf
+    fqdn: golf.matrix.spaceships.work
+    ip: 192.168.3.6
+    corosync_ip: 192.168.8.6
+  - short_name: hotel
+    fqdn: hotel.matrix.spaceships.work
+    ip: 192.168.3.7
+    corosync_ip: 192.168.8.7
+```
+
+## Pattern: SSH Key Distribution for Cluster Operations
+
+**Problem**: Some cluster operations require passwordless SSH between nodes.
+
+**Solution**: Automate SSH key generation and distribution.
+
+### Implementation
+
+```yaml
+- name: Generate SSH key for root (if not exists)
+  ansible.builtin.user:
+    name: root
+    generate_ssh_key: true
+    ssh_key_bits: 4096
+    ssh_key_type: rsa
+  register: root_ssh_key
+
+- name: Fetch public keys from all nodes
+  ansible.builtin.slurp:
+    src: /root/.ssh/id_rsa.pub
+  register: node_public_keys
+
+- name: Distribute SSH keys to all nodes
+  ansible.posix.authorized_key:
+    user: root
+    state: present
+    key: "{{ hostvars[item].node_public_keys.content | b64decode }}"
+  loop: "{{ groups['proxmox'] }}"
+  when: item != inventory_hostname
+```
+
+## Pattern: Service Restart Orchestration
+
+**Problem**: Cluster services must restart in specific order after configuration changes.
+
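+**Solution**: Use handlers defined in the required restart order (handlers run in the order they are defined, not the order they are notified) and throttle disruptive restarts to one node at a time.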
+
+### Implementation
+
+```yaml
+# tasks/main.yml
+- name: Configure corosync
+  ansible.builtin.template:
+    src: corosync.conf.j2
+    dest: /etc/pve/corosync.conf
+    validate: corosync-cfgtool -c %s
+  notify:
+    - reload corosync
+    - restart pve-cluster
+    - restart pvedaemon
+    - restart pveproxy
+
+# handlers/main.yml
+- name: reload corosync
+  ansible.builtin.systemd:
+    name: corosync
+    state: reloaded
+  listen: reload corosync
+
+- name: restart pve-cluster
+  ansible.builtin.systemd:
+    name: pve-cluster
+    state: restarted
+  listen: restart pve-cluster
+  throttle: 1  # Restart one node at a time
+
+- name: restart pvedaemon
+  ansible.builtin.systemd:
+    name: pvedaemon
+    state: restarted
+  listen: restart pvedaemon
+
+- name: restart pveproxy
+  ansible.builtin.systemd:
+    name: pveproxy
+    state: restarted
+  listen: restart pveproxy
+```
+
+## Pattern: Quorum and Health Verification
+
+**Problem**: Cluster may appear successful but have quorum issues or split-brain scenarios.
+
+**Solution**: Always verify cluster health after operations.
+
+### Implementation
+
+```yaml
+- name: Wait for cluster to stabilize
+  ansible.builtin.pause:
+    seconds: 10
+  when: cluster_create.changed or cluster_join.changed
+
+- name: Verify cluster quorum
+  ansible.builtin.command:
+    cmd: pvecm status
+  register: cluster_health
+  changed_when: false
+  failed_when: "'Quorate: Yes' not in cluster_health.stdout"
+
+# `pvecm nodes` prints header lines above the membership table, so comparing
+# the raw line count with the inventory size would fail on a healthy cluster.
+# Count only the rows that begin with a numeric node ID and vote count.
+- name: Check expected node count
+  ansible.builtin.command:
+    cmd: pvecm nodes
+  register: cluster_nodes_final
+  changed_when: false
+  failed_when: >-
+    (cluster_nodes_final.stdout_lines
+    | select('match', '^ *[0-9]+ +[0-9]+ +[^ ]')
+    | list | length) != (groups['proxmox'] | length)
+
+- name: Display cluster status
+  ansible.builtin.debug:
+    var: cluster_health.stdout_lines
+  when: cluster_health.changed or ansible_verbosity > 0
+```
+
+## Anti-Pattern: Silent Error Suppression
+
+**❌ Don't Do This**:
+
+```yaml
+- name: Join cluster on other nodes
+  ansible.builtin.shell: |
+    timeout 60 pvecm add {{ primary_node }}
+  failed_when: false  # Silently ignores ALL errors
+```
+
+**Problems**:
+
+- Hides real failures (network issues, authentication problems)
+- Makes debugging impossible
+- Creates inconsistent cluster state
+- Provides false success signals
+
+**✅ Do This Instead**:
+
+```yaml
+- name: Join cluster on other nodes
+  ansible.builtin.command:
+    cmd: "pvecm add {{ primary_node }}"
+  register: cluster_join
+  failed_when:
+    - cluster_join.rc != 0
+    - "'already in a cluster' not in cluster_join.stderr"
+    - "'cannot join cluster' not in cluster_join.stderr"
+  changed_when: cluster_join.rc == 0
+
+- name: Handle join failure
+  ansible.builtin.fail:
+    msg: |
+      Failed to join cluster {{ cluster_name }}.
+      Error: {{ cluster_join.stderr }}
+      Hint: Check network connectivity and ensure first node is reachable.
+  when:
+    - cluster_join.rc != 0
+    - "'already in a cluster' not in cluster_join.stderr"
+```
+
+## Complete Role Example
+
+```yaml
+# roles/proxmox_cluster/tasks/main.yml
+---
+- name: Verify prerequisites
+  ansible.builtin.include_tasks: prerequisites.yml
+
+- name: Configure /etc/hosts
+  ansible.builtin.include_tasks: hosts_config.yml
+
+- name: Distribute SSH keys
+  ansible.builtin.include_tasks: ssh_keys.yml
+
+- name: Initialize cluster (first node only)
+  ansible.builtin.include_tasks: cluster_init.yml
+  when: inventory_hostname == groups['proxmox'][0]
+
+- name: Join cluster (other nodes)
+  ansible.builtin.include_tasks: cluster_join.yml
+  when: inventory_hostname != groups['proxmox'][0]
+
+- name: Configure corosync
+  ansible.builtin.include_tasks: corosync.yml
+
+- name: Verify cluster health
+  ansible.builtin.include_tasks: verify.yml
+```
+
+## Testing
+
+```bash
+# Syntax check
+ansible-playbook --syntax-check playbooks/cluster-init.yml
+
+# Check mode (dry run)
+ansible-playbook playbooks/cluster-init.yml --check --diff
+
+# Run on specific cluster
+ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
+
+# Verify idempotency (should show 0 changes on second run)
+ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
+ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
+```
+
+## Related Patterns
+
+- [Error Handling](error-handling.md) - Comprehensive error handling strategies
+- [Network Automation](network-automation.md) - Network interface and bridge configuration
+- [CEPH Storage](ceph-automation.md) - CEPH cluster deployment patterns
+
+## References
+
+- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 153-207)
+- Proxmox VE Cluster Manager documentation
+- Corosync configuration guide
--- a/skills/ansible-best-practices/patterns/documentation-templates.md
+++ b/skills/ansible-best-practices/patterns/documentation-templates.md
@@ -0,0 +1,986 @@
+# Documentation Templates
+
+## Summary: Pattern Confidence
+
+Analyzed 7 geerlingguy roles: security, github-users, docker, postgresql, nginx, pip, git
+
+**Universal Patterns (All 7 roles):**
+
+- Consistent README structure: Title + Badge → Description → Requirements → Variables → Dependencies → Example →
+  License → Author (7/7 roles)
+- CI badge showing test status with link to workflow (7/7 roles)
+- Code-formatted variable defaults with detailed descriptions (7/7 roles)
+- Example playbook section with working examples (7/7 roles)
+- Inline code formatting for variables, file paths, commands (7/7 roles)
+- Explicit "None" for empty sections (Requirements, Dependencies) (7/7 roles)
+- License + Author sections with links (7/7 roles)
+- Variable grouping for related configuration (7/7 roles)
+- Commented list examples showing optional items (7/7 roles)
+
+**Contextual Patterns (Varies by complexity):**
+
+- Warning/caveat sections: security-critical roles have prominent warnings, simple roles don't need them
+- Variable documentation depth: complex roles (postgresql) have extensive inline docs, simple roles (pip) are
+  more concise
+- Example complexity: simple roles show basic examples, complex roles show multiple scenarios
+- Troubleshooting sections: recommended for roles that modify critical services (SSH, networking), optional for
+  simple roles
+- Complex variable documentation: roles with 5+ optional dict attributes show ALL keys with inline comments
+
+**Evolving Patterns (Newer roles improved):**
+
+- PostgreSQL shows best practices for complex variable documentation: show all keys, mark required vs optional,
+  document defaults
+- nginx demonstrates template extensibility documentation (Jinja2 block inheritance)
+- Complex roles provide comprehensive inline examples in defaults/ files as primary documentation
+
+**Sources:**
+
+- geerlingguy.security (analyzed 2025-10-23)
+- geerlingguy.github-users (analyzed 2025-10-23)
+- geerlingguy.docker (analyzed 2025-10-23)
+- geerlingguy.postgresql (analyzed 2025-10-23)
+- geerlingguy.nginx (analyzed 2025-10-23)
+- geerlingguy.pip (analyzed 2025-10-23)
+- geerlingguy.git (analyzed 2025-10-23)
+
+**Repositories:**
+
+- <https://github.com/geerlingguy/ansible-role-security>
+- <https://github.com/geerlingguy/ansible-role-github-users>
+- <https://github.com/geerlingguy/ansible-role-docker>
+- <https://github.com/geerlingguy/ansible-role-postgresql>
+- <https://github.com/geerlingguy/ansible-role-nginx>
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+## Pattern Confidence Levels (Historical)
+
+Analyzed 2 geerlingguy roles: security, github-users
+
+**Universal Patterns (Both roles use identical approach):**
+
+1. ✅ **README structure** - Both follow: Title + Badge → Description → Requirements → Variables → Dependencies →
+   Example → License → Author
+2. ✅ **CI badge** - Both include GitHub Actions CI badge with link to workflow
+3. ✅ **Variable documentation format** - Code-formatted default + detailed description
+4. ✅ **Example playbook section** - Both show minimal working example with vars
+5. ✅ **Inline code formatting** - Backticks for variables, file paths, commands
+6. ✅ **Commented list examples** - Show example list items as comments
+7. ✅ **"None" for empty sections** - Explicit "None" instead of omitting (Requirements, Dependencies)
+8. ✅ **License + Author sections** - Both include MIT license and author with links
+9. ✅ **Variable grouping** - Related variables documented together with shared context
+
+**Contextual Patterns (Varies by role complexity):**
+
+1. ⚠️  **Warning/caveat section** - security has prominent security warning, github-users doesn't need
+   one
+2. ⚠️  **Variable detail level** - security has extensive variable docs with warnings, github-users is more
+   concise (fewer variables)
+3. ⚠️  **Example complexity** - security shows vars_files pattern, github-users shows inline vars (simpler)
+4. ⚠️  **Troubleshooting section** - Neither role has explicit troubleshooting (could be added)
+
+**Key Finding:** README documentation follows a strict template across roles. Only the caveat/warning section varies
+based on role risk profile.
+
+## Overview
+
+This document captures documentation patterns from production-grade Ansible roles, demonstrating how to create
+clear, comprehensive README files that help users understand and use the role effectively.
+
+## README Structure
+
+### Pattern: Comprehensive README Template
+
+**Description:** A well-structured README that follows a consistent format, providing all necessary information for
+users to understand and use the role.
+
+**File Path:** `README.md`
+
+**Standard README Sections:**
+
+1. Title and badges
+2. Caveat/Warning (if applicable)
+3. Role description
+4. Requirements
+5. Role Variables
+6. Dependencies
+7. Example Playbook
+8. License
+9. Author Information
+
+### Section 1: Title and Badges
+
+**Example Code:**
+
+```markdown
+# Ansible Role: Security (Basics)
+
+[![CI](https://github.com/geerlingguy/ansible-role-security/actions/workflows/ci.yml/badge.svg)](https://github.com/geerlingguy/ansible-role-security/actions/workflows/ci.yml)
+```
+
+**Key Elements:**
+
+1. **Clear title** - Role name with descriptive subtitle
+2. **CI badge** - Shows test status (builds confidence)
+3. **Badge links to CI** - Users can see test results
+
+**When to Use:**
+
+- Always include clear role title
+- Add CI badge if you have automated testing
+- Link badges to their status pages
+- Consider adding Galaxy badge, version badge, downloads badge
+
+**Badge Examples:**
+
+```markdown
+[![CI](https://github.com/user/repo/actions/workflows/ci.yml/badge.svg)](https://github.com/user/repo/actions/workflows/ci.yml)
+[![Ansible Galaxy](https://img.shields.io/badge/galaxy-user.rolename-blue.svg)](https://galaxy.ansible.com/user/rolename)
+[![License](https://img.shields.io/badge/license-MIT-brightgreen.svg)](LICENSE)
+```
+
+**Anti-pattern:**
+
+- Don't skip the title (obvious but happens)
+- Avoid outdated or broken badges
+- Don't add badges that don't provide value
+
+### Section 2: Caveat/Warning (Optional)
+
+**Example Code:**
+
+```markdown
+**First, a major, MAJOR caveat**: the security of your servers is YOUR
+responsibility. If you think simply including this role and adding a firewall
+makes a server secure, then you're mistaken. Read up on Linux, network, and
+application security, and know that no matter how much you know, you can
+always make every part of your stack more secure.
+
+That being said, this role performs some basic security configuration on
+RedHat and Debian-based linux systems. It attempts to:
+
+  - Install software to monitor bad SSH access (fail2ban)
+  - Configure SSH to be more secure (disabling root login, requiring
+    key-based authentication, and allowing a custom SSH port to be set)
+  - Set up automatic updates (if configured to do so)
+
+There are a few other things you may or may not want to do (which are not
+included in this role) to make sure your servers are more secure, like:
+
+  - Use logwatch or a centralized logging server to analyze and monitor
+    log files
+  - Securely configure user accounts and SSH keys (this role assumes you're
+    not using password authentication or logging in as root)
+  - Have a well-configured firewall (check out the `geerlingguy.firewall`
+    role on Ansible Galaxy for a flexible example)
+
+Again: Your servers' security is *your* responsibility.
+```
+
+**Key Elements:**
+
+1. **Prominent warning** - Sets expectations clearly
+2. **Scope definition** - What the role does and doesn't do
+3. **Additional recommendations** - Points to complementary practices
+4. **Emphasis** - Bold, italics, repetition for important points
+
+**When to Use:**
+
+- Security-related roles (critical warnings)
+- Roles that could cause service disruption
+- Roles with common misunderstandings
+- Complex roles with limited scope
+
+**Anti-pattern:**
+
+- Don't add warnings for routine roles
+- Avoid legal disclaimers (that's what LICENSE is for)
+- Don't be condescending
+
+### Section 3: Requirements
+
+**Example Code:**
+
+```markdown
+## Requirements
+
+For obvious reasons, `sudo` must be installed if you want to manage the
+sudoers file with this role.
+
+On RedHat/CentOS systems, make sure you have the EPEL repository installed
+(you can include the `geerlingguy.repo-epel` role to get it installed).
+
+No special requirements for Debian/Ubuntu systems.
+```
+
+**Key Elements:**
+
+1. **System requirements** - Software that must be pre-installed
+2. **OS-specific requirements** - Different requirements per platform
+3. **How to meet requirements** - Links to other roles or instructions
+4. **Explicit "no requirements" statement** - Clarity when none exist
+
+**When to Use:**
+
+- List any software that must be installed first
+- Document repository requirements (EPEL, PPAs)
+- Mention privilege requirements (become/sudo)
+- Note Python library dependencies
+- State "None" if no requirements (clear communication)
+
+**Anti-pattern:**
+
+- Don't assume users know about EPEL or special repos
+- Avoid listing Ansible itself (assumed)
+- Don't skip this section (at least say "None")
+
+### Section 4: Role Variables
+
+**Example Code:**
+
+```markdown
+## Role Variables
+
+Available variables are listed below, along with default values (see
+`defaults/main.yml`):
+
+    security_ssh_port: 22
+
+The port through which you'd like SSH to be accessible. The default is port
+22, but if you're operating a server on the open internet, and have no
+firewall blocking access to port 22, you'll quickly find that thousands of
+login attempts per day are not uncommon. You can change the port to a
+nonstandard port (e.g. 2849) if you want to avoid these thousands of
+automated penetration attempts.
+
+    security_ssh_password_authentication: "no"
+    security_ssh_permit_root_login: "no"
+    security_ssh_usedns: "no"
+    security_ssh_permit_empty_password: "no"
+    security_ssh_challenge_response_auth: "no"
+    security_ssh_gss_api_authentication: "no"
+    security_ssh_x11_forwarding: "no"
+
+Security settings for SSH authentication. It's best to leave these set to
+`"no"`, but there are times (especially during initial server configuration
+or when you don't have key-based authentication in place) when one or all
+may be safely set to `'yes'`. **NOTE: It is _very_ important that you quote
+the 'yes' or 'no' values. Failure to do so may lock you out of your server.**
+
+    security_ssh_allowed_users: []
+    # - alice
+    # - bob
+    # - charlie
+
+A list of users allowed to connect to the host over SSH.  If no user is
+defined in the list, the task will be skipped.
+
+    security_sudoers_passwordless: []
+    security_sudoers_passworded: []
+
+A list of users who should be added to the sudoers file so they can run any
+command as root (via `sudo`) either without a password or requiring a
+password for each command, respectively.
+
+    security_autoupdate_enabled: true
+
+Whether to install/enable `yum-cron` (RedHat-based systems) or
+`unattended-upgrades` (Debian-based systems). System restarts will not
+happen automatically in any case, and automatic upgrades are no excuse for
+sloppy patch and package management, but automatic updates can be helpful
+as yet another security measure.
+
+    security_fail2ban_enabled: true
+
+Whether to install/enable `fail2ban`. You might not want to use fail2ban if
+you're already using some other service for login and intrusion detection
+(e.g. [ConfigServer](http://configserver.com/cp/csf.html)).
+```
+
+**Documentation Pattern:**
+
+For each variable:
+
+1. **Show default value** - Code-formatted with actual default
+2. **Description** - What it does, when to use it
+3. **Context** - Why you might change it
+4. **Examples** - Show different values for lists/dicts
+5. **Warnings** - Important notes (quoting, locking out, etc.)
+
+**Formatting Guidelines:**
+
+- Use 4-space indentation for default values
+- Group related variables together
+- Add blank lines between variable groups
+- Use inline code formatting for values
+- Bold important warnings
+- Comment out example list items
+
+**When to Use:**
+
+- Document ALL variables from defaults/main.yml
+- Group related variables (ssh_*, autoupdate_*, etc.)
+- Provide context, not just description
+- Include warnings for dangerous settings
+- Show example values for complex structures
+
+**Anti-pattern:**
+
+- Don't just list variables without explanation
+- Avoid documenting vars/ (internal implementation)
+- Don't skip context (users need to know WHY)
+- Avoid stale documentation (keep in sync with defaults/)
+
+### Pattern: Variable Table Format (Alternative)
+
+**Description:** Some roles use a table format for variable documentation. While geerlingguy.security doesn't use
+this, it's a valid alternative pattern.
+
+**Example Table Format:**
+
+```markdown
+## Role Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `security_ssh_port` | `22` | SSH port number |
+| `security_ssh_password_authentication` | `"no"` | Enable password authentication |
+| `security_fail2ban_enabled` | `true` | Install and configure fail2ban |
+```
+
+**When to Use:**
+
+- Roles with many simple variables
+- When brief descriptions are sufficient
+- For quick reference guides
+
+**Comparison:**
+
+| Format | Best For | Pros | Cons |
+|--------|----------|------|------|
+| Text with examples | Complex variables, detailed context | Detailed explanations, examples | More verbose |
+| Table | Simple variables, quick reference | Concise, scannable | Limited detail space |
+
+**Virgo-Core Preference:**
+
+Use text format with examples (matches geerlingguy pattern) for main documentation, optionally add table for quick
+reference.
+
+### Section 5: Dependencies
+
+**Example Code:**
+
+```markdown
+## Dependencies
+
+None.
+```
+
+**When Dependencies Exist:**
+
+```markdown
+## Dependencies
+
+This role depends on:
+
+- `geerlingguy.repo-epel` (for RedHat/CentOS systems)
+- `geerlingguy.firewall` (recommended but optional)
+
+The role will automatically install required dependencies from Ansible Galaxy.
+```
+
+**Key Elements:**
+
+1. **Explicit "None"** - Clear when no dependencies
+2. **List dependencies** - With context about why needed
+3. **Distinguish required vs optional** - Important for users
+4. **Note automatic installation** - Reduces confusion
+
+**When to Use:**
+
+- Always include this section
+- List role dependencies from meta/main.yml
+- Note recommended complementary roles
+- State "None" if no dependencies
+
+**Anti-pattern:**
+
+- Don't skip this section
+- Avoid listing collection dependencies here (put in Requirements)
+
+### Section 6: Example Playbook
+
+**Example Code:**
+
+```markdown
+## Example Playbook
+
+    - hosts: servers
+      vars_files:
+        - vars/main.yml
+      roles:
+        - geerlingguy.security
+
+*Inside `vars/main.yml`*:
+
+    security_sudoers_passworded:
+      - johndoe
+      - deployacct
+```
+
+**Key Elements:**
+
+1. **Minimal working example** - Shows basic usage
+2. **Variable override example** - Demonstrates customization
+3. **Multiple files** - Shows playbook and vars file
+4. **Real-world example** - Not generic foo/bar examples
+5. **Indentation** - YAML is indented 4 spaces so Markdown renders it as a code block and it stays readable
+
+**Enhanced Example Pattern:**
+
+```markdown
+## Example Playbook
+
+### Basic Usage
+
+    - hosts: all
+      roles:
+        - geerlingguy.security
+
+### Custom Configuration
+
+    - hosts: webservers
+      vars:
+        security_ssh_port: 2222
+        security_fail2ban_enabled: true
+        security_autoupdate_enabled: true
+      roles:
+        - geerlingguy.security
+
+### Advanced Example with Sudoers
+
+    - hosts: appservers
+      vars:
+        security_sudoers_passwordless:
+          - deploy
+        security_sudoers_passworded:
+          - developer
+          - operator
+      roles:
+        - geerlingguy.security
+```
+
+**When to Use:**
+
+- Always include at least one example
+- Show basic usage first
+- Add advanced examples for complex features
+- Use realistic variable values
+- Include multiple scenarios if role has distinct use cases
+
+**Anti-pattern:**
+
+- Don't use only generic examples (foo, bar, example.com)
+- Avoid incomplete examples (missing required vars)
+- Don't show every possible variable (overwhelming)
+
+### Section 7: License and Author
+
+**Example Code:**
+
+```markdown
+## License
+
+MIT (Expat) / BSD
+
+## Author Information
+
+This role was created in 2014 by [Jeff Geerling](https://www.jeffgeerling.com/),
+author of [Ansible for DevOps](https://www.ansiblefordevops.com/).
+```
+
+**Key Elements:**
+
+1. **License name** - Clear license statement
+2. **Author information** - Who created/maintains it
+3. **Links** - Author website, book, company
+4. **Year created** - Provides context
+
+**When to Use:**
+
+- Always include license (required for Galaxy)
+- Add author name and contact
+- Link to LICENSE file for full text
+- Keep it brief
+
+**Anti-pattern:**
+
+- Don't include full license text in README (use LICENSE file)
+- Avoid complex author information
+
+## Additional Documentation Patterns
+
+### Pattern: Troubleshooting Section
+
+**Description:** While geerlingguy.security doesn't include a troubleshooting section, more complex roles should
+include one.
+
+**Example Troubleshooting Section:**
+
+```markdown
+## Troubleshooting
+
+### SSH Connection Refused After Running Role
+
+If you lose SSH connectivity after running this role, you may have:
+
+1. Changed the SSH port without updating your firewall rules
+2. Disabled password authentication without setting up SSH keys
+3. Set `security_ssh_allowed_users` without including your username
+
+**Solution:** Access the server via console and check `/etc/ssh/sshd_config`.
+
+### Fail2ban Not Starting
+
+If fail2ban fails to start, check that the log files it monitors exist:
+
+    ls -la /var/log/auth.log
+
+On some minimal systems, these log files may not exist until a service
+writes to them.
+
+**Solution:** Create empty log files or disable fail2ban temporarily.
+```
+
+**When to Use:**
+
+- Roles that modify critical services (SSH, networking)
+- Roles with common configuration mistakes
+- Roles with tricky OS-specific issues
+- Complex roles with multiple failure modes
+
+**Anti-pattern:**
+
+- Don't include troubleshooting for roles that are straightforward
+- Avoid listing every possible error (focus on common issues)
+
+### Pattern: Inline Code and Formatting
+
+**Formatting Patterns from README:**
+
+1. **Inline code** - Use backticks: `fail2ban`, `sudo`, `/etc/ssh/sshd_config`
+2. **File paths** - Always use inline code: `defaults/main.yml`
+3. **Commands** - Inline code for short commands: `sudo systemctl restart ssh`
+4. **Variable names** - Inline code: `security_ssh_port`
+5. **Code blocks** - Use 4-space indentation for YAML/code examples
+6. **Emphasis** - Bold for **important warnings**, italics for *emphasis*
+7. **Lists** - Use `-` for unordered, numbers for ordered
+
+**Example:**
+
+```markdown
+To configure SSH port, set `security_ssh_port` in your playbook variables.
+The configuration is written to `/etc/ssh/sshd_config` and validated with
+`sshd -T -f %s` before applying. **WARNING**: Changing the SSH port without
+updating firewall rules will lock you out.
+```
+
+## Comparison to Virgo-Core Roles
+
+### system_user Role
+
+**README Analysis:**
+
+**Matches:**
+
+- ✅ Has clear title
+- ✅ Good role description
+- ✅ Documents variables
+- ✅ Includes example playbook
+- ✅ Has license and author sections
+
+**Gaps:**
+
+- ❌ No CI badge (no CI yet)
+- ⚠️  Variable documentation less detailed (could add more context)
+- ⚠️  Could add troubleshooting section (SSH key issues common)
+- ⚠️  No table of contents (nice-to-have for longer docs)
+
+**Priority Actions:**
+
+1. **Important:** Enhance variable documentation with usage context (30 min)
+2. **Important:** Add troubleshooting section (1 hour)
+3. **Nice-to-have:** Add CI badge after implementing CI (5 min)
+
+### proxmox_access Role
+
+**README Analysis:**
+
+**Matches:**
+
+- ✅ Comprehensive variable documentation
+- ✅ Good examples
+- ✅ Security warnings included
+
+**Gaps:**
+
+- ❌ No CI badge
+- ⚠️  Could add more example playbooks (different scenarios)
+- ⚠️  Troubleshooting section would help (token creation failures)
+
+**Priority Actions:**
+
+1. **Important:** Add troubleshooting for common token issues (1 hour)
+2. **Important:** Add more example scenarios (30 min)
+3. **Nice-to-have:** Add requirements section (15 min)
+
+### proxmox_network Role
+
+**README Analysis:**
+
+**Matches:**
+
+- ✅ Good structure
+- ✅ Clear variable documentation
+- ✅ Network architecture context
+
+**Gaps:**
+
+- ❌ No CI badge
+- ⚠️  Network troubleshooting section would be valuable
+- ⚠️  Could add verification examples (how to check it worked)
+
+**Priority Actions:**
+
+1. **Important:** Add network troubleshooting section (1 hour)
+2. **Important:** Add verification examples (30 min)
+3. **Nice-to-have:** Add network topology diagram (1 hour)
+
+## Template: Complete README Structure
+
+```markdown
+# Ansible Role: [Role Name]
+
+[![CI](badge-url)](ci-url)
+[![Ansible Galaxy](badge-url)](galaxy-url)
+
+[Brief role description - what it does, key features]
+
+[Optional: Warning/caveat section for critical roles]
+
+## Requirements
+
+[List prerequisites, or "None"]
+
+## Role Variables
+
+Available variables are listed below, along with default values (see
+`defaults/main.yml`):
+
+    variable_name: default_value
+
+[Description of variable, when to change it, usage examples]
+
+    another_variable: []
+    # - example1
+    # - example2
+
+[Description with examples]
+
+## Dependencies
+
+[List role dependencies, or "None"]
+
+## Example Playbook
+
+### Basic Usage
+
+    - hosts: all
+      roles:
+        - rolename
+
+### Custom Configuration
+
+    - hosts: servers
+      vars:
+        variable_name: custom_value
+      roles:
+        - rolename
+
+## Troubleshooting
+
+[Optional: Common issues and solutions]
+
+## License
+
+MIT / BSD / Apache 2.0
+
+## Author Information
+
+This role was created by [Author Name](link), [additional context].
+```
+
+## Validation: geerlingguy.postgresql
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
+
+### README Structure
+
+- **Pattern: Comprehensive README template** - ✅ **Confirmed**
+  - PostgreSQL follows same structure: Title + Badge → Description → Requirements → Variables → Dependencies →
+    Example → License → Author
+  - **4/4 roles follow identical README structure**
+
+### Variable Documentation
+
+- **Pattern: Code-formatted default + detailed description** - ✅ **EXCELLENT EXAMPLE**
+  - PostgreSQL has extensive variable docs (50+ variables documented)
+  - Each variable group includes:
+    - Code block with default value
+    - Detailed description of purpose
+    - Usage context and examples
+    - Inline comments for complex structures
+  - **Example quality:**
+
+  ```markdown
+      postgresql_databases:
+        - name: exampledb # required; the rest are optional
+          lc_collate: # defaults to 'en_US.UTF-8'
+          lc_ctype: # defaults to 'en_US.UTF-8'
+          encoding: # defaults to 'UTF-8'
+  ```
+
+  - **Validates:** Complex dict variables need inline comment documentation
+  - **4/4 roles use this documentation pattern**
+
+### CI Badge
+
+- **Pattern: GitHub Actions CI badge** - ✅ **Confirmed**
+  - PostgreSQL includes CI badge with link to workflow
+  - **4/4 roles have CI badges**
+
+### Example Playbook
+
+- **Pattern: Basic + vars_files example** - ✅ **Confirmed**
+  - Shows minimal playbook + vars file pattern
+  - Includes example variable values for databases and users
+  - **4/4 roles provide working examples**
+
+### Requirements Section
+
+- **Pattern: Explicit requirements or "None"** - ✅ **Confirmed**
+  - PostgreSQL states: "No special requirements"
+  - Mentions become: yes requirement
+  - **4/4 roles include Requirements section (even if "None")**
+
+### Dependencies Section
+
+- **Pattern: Explicit "None"** - ✅ **Confirmed**
+  - PostgreSQL states: "None."
+  - **4/4 roles include Dependencies section**
+
+### Advanced Pattern: Complex Variable Tables
+
+- **Pattern Evolution:** PostgreSQL uses structured tables for complex options:
+  - **hba_entries:** Lists all available keys with descriptions
+  - **databases:** Shows optional attributes with defaults
+  - **users:** Documents every possible parameter
+  - **Insight:** When variables have 5+ optional attributes, use structured documentation
+  - **Recommendation:** For complex dict structures, show all keys even if optional
+
+### Documentation for Complex Structures
+
+- **Pattern: Show all keys, even optional** - ✅ **NEW INSIGHT**
+  - PostgreSQL documents every possible key for postgresql_databases, postgresql_users, postgresql_privs
+  - Includes comments like "# required" vs "# optional"
+  - Shows default values inline: `# defaults to 'en_US.UTF-8'`
+  - **Best practice:** Comprehensive documentation prevents user confusion
+
+### Key Validation Findings
+
+**What PostgreSQL Role Confirms:**
+
+1. ✅ README structure is universal (4/4 roles identical)
+2. ✅ Variable documentation format is universal (4/4 roles)
+3. ✅ CI badges are universal (4/4 roles)
+4. ✅ Example playbooks are universal (4/4 roles)
+5. ✅ Explicit "None" for empty sections is universal (4/4 roles)
+6. ✅ Inline code formatting is universal (4/4 roles)
+
+**What PostgreSQL Role Demonstrates:**
+
+1. 🔄 Complex variables need extensive inline documentation
+2. 🔄 Show ALL available keys for dict structures, even optional ones
+3. 🔄 Use comments to indicate required vs optional vs defaults
+4. 🔄 Large variable sets (20+) benefit from grouping in documentation
+
+**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
+
+- **README structure:** UNIVERSAL (4/4 roles identical)
+- **Variable documentation:** UNIVERSAL (4/4 use same format)
+- **CI badges:** UNIVERSAL (4/4 roles have them)
+- **Example playbooks:** UNIVERSAL (4/4 provide examples)
+- **Explicit "None":** UNIVERSAL (4/4 use it)
+- **Complex variable docs:** VALIDATED (postgresql shows best practices for complexity)
+
+## Validation: geerlingguy.pip
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-pip>
+
+### README Structure
+
+- **Pattern: Standard sections** - ✅ **Confirmed**
+  - Title with CI badge
+  - Description: "Installs Pip (Python package manager) on Linux"
+  - Requirements section (mentions EPEL for RHEL/CentOS)
+  - Role Variables section with defaults and descriptions
+  - Dependencies section (None.)
+  - Example Playbook section
+  - License and Author Information
+  - **6/6 roles follow identical README structure**
+
+### Variable Documentation
+
+- **Pattern: Simple variable table** - ✅ **Confirmed**
+  - pip_package: Default python3-pip, shows alternative for Python 2
+  - pip_executable: Documents auto-detection, shows override example
+  - pip_install_packages: Shows list format with dict options
+  - **All 3 variables documented with defaults and usage context**
+
+- **Pattern: List-of-dicts inline example** - ✅ **Confirmed**
+  - pip_install_packages shows dict keys: name, version, state, extra_args, virtualenv
+  - Example shows installing specific version: `docker==7.1.0`
+  - Shows AWS CLI installation example
+  - **6/6 roles document list variables with inline examples**
+
+### Requirements Section
+
+- **Pattern: Explicit prerequisites** - ✅ **Confirmed**
+  - States: "On RedHat/CentOS, you may need to have EPEL installed"
+  - Recommends geerlingguy.repo-epel role
+  - **Key insight:** Even simple roles document prerequisites
+
+### Example Playbook
+
+- **Pattern: Single basic example** - ✅ **Confirmed**
+  - Shows installing 2 packages (docker, awscli)
+  - Demonstrates vars: section with pip_install_packages
+  - Clean, minimal example for utility role
+  - **Validates:** Simple roles don't need complex examples
+
+### Key Validation Findings
+
+**What pip Role Confirms:**
+
+1. ✅ README structure universal even for minimal roles (6/6 roles)
+2. ✅ All variables documented even when only 3 total (6/6 roles)
+3. ✅ CI badge present even for simple roles (6/6 roles)
+4. ✅ Example playbooks scaled appropriately (simple role = simple example)
+5. ✅ Prerequisites documented even when minimal
+
+**Pattern Confidence After pip Validation (6/6 roles):**
+
+- **README structure:** UNIVERSAL (6/6 roles identical)
+- **Variable documentation:** UNIVERSAL (6/6 document all variables)
+- **CI badges:** UNIVERSAL (6/6 roles have them)
+- **Example playbooks:** UNIVERSAL (6/6, scaled to complexity)
+
+## Validation: geerlingguy.git
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-git>
+
+### README Structure
+
+- **Pattern: Standard sections** - ✅ **Confirmed**
+  - Title with CI badge
+  - Description: "Installs Git, a distributed version control system"
+  - Requirements section (None.)
+  - Role Variables section with comprehensive variable list
+  - Dependencies section (None.)
+  - Example Playbook section
+  - License and Author Information
+  - **7/7 roles follow identical README structure**
+
+### Variable Documentation
+
+- **Pattern: Grouped variables** - ✅ **Confirmed**
+  - git_packages: Package list with platform-specific defaults
+  - git_install_from_source: Boolean flag with clear purpose
+  - Source install variables grouped together (workspace, version, path, force_update)
+  - **Key insight:** Utility roles with options group related variables
+
+- **Pattern: Boolean flags clearly explained** - ✅ **Confirmed**
+  - git_install_from_source: "`false` by default. If set to `true`, installs from source"
+  - git_install_force_update: Explains version downgrade protection
+  - **7/7 roles document boolean flag purpose and default**
+
+### Requirements Section
+
+- **Pattern: Explicit "None"** - ✅ **Confirmed**
+  - States: "None."
+  - **7/7 roles include Requirements section even if none needed**
+
+### Example Playbook
+
+- **Pattern: Multiple scenarios** - ✅ **Confirmed**
+  - Shows package installation example
+  - Implies source installation available via variables
+  - **Validates:** Utility roles with multiple modes show key scenarios
+
+### Key Validation Findings
+
+**What git Role Confirms:**
+
+1. ✅ README structure universal across all role types (7/7 roles)
+2. ✅ Variable grouping for related options (7/7 roles)
+3. ✅ Boolean flags clearly explained (7/7 roles)
+4. ✅ CI badge standard even for simple roles (7/7 roles)
+5. ✅ Documentation scales with role complexity
+
+**Pattern Confidence After git Validation (7/7 roles):**
+
+- **README structure:** UNIVERSAL (7/7 roles identical)
+- **Variable documentation:** UNIVERSAL (7/7 document all variables with context)
+- **CI badges:** UNIVERSAL (7/7 roles have them)
+- **Example playbooks:** UNIVERSAL (7/7 provide working examples)
+- **Explicit "None":** UNIVERSAL (7/7 use for empty sections)
+- **Variable grouping:** UNIVERSAL (7/7 group related variables)
+- **Boolean flag documentation:** UNIVERSAL (7/7 explain purpose clearly)
+
+## Summary
+
+**Universal Patterns Identified:**
+
+1. Consistent README structure (title → description → requirements → variables → dependencies → examples → license → author)
+2. CI badges for test status
+3. Comprehensive variable documentation with defaults and context
+4. Multiple example playbooks (basic → advanced)
+5. Explicit "None" statements for empty sections
+6. Inline code formatting for variables, files, commands
+7. Bold warnings for critical information
+8. Commented examples for list variables
+9. Show ALL keys for complex dict structures, even optional ones
+
+**Key Takeaways:**
+
+- Variable documentation should include defaults AND context
+- Examples should progress from simple to complex
+- Warnings prevent common mistakes
+- Consistent formatting improves readability
+- Explicit "None" is better than omitting sections
+- Troubleshooting saves support time
+- Complex variables need inline documentation showing all available keys
+
+**Next Steps:**
+
+Enhance Virgo-Core role READMEs with:
+
+1. More detailed variable context
+2. Troubleshooting sections
+3. CI badges (after implementing testing)
+4. Additional example scenarios
+5. For complex variables, show all available keys with inline comments
--- a/skills/ansible-best-practices/patterns/error-handling.md
+++ b/skills/ansible-best-practices/patterns/error-handling.md
@@ -0,0 +1,576 @@
+# Error Handling Patterns
+
+## Overview
+
+Proper error handling in Ansible ensures playbooks are robust, idempotent, and provide clear failure
+messages. This guide covers patterns from the Virgo-Core repository.
+
+## Core Concepts
+
+### changed_when
+
+Controls when Ansible reports a task as "changed". Critical for idempotency with `command` and `shell` modules.
+
+**Syntax:**
+
+```yaml
+changed_when: <boolean expression>
+```
+
+### failed_when
+
+Controls when Ansible considers a task as failed. Allows graceful handling of expected errors.
+
+**Syntax:**
+
+```yaml
+failed_when: <boolean expression>
+```
+
+### register
+
+Captures task output for later inspection and conditional logic.
+
+**Syntax:**
+
+```yaml
+register: variable_name
+```
+
+## Pattern 1: Idempotent Command Execution
+
+### Problem
+
+`command` and `shell` modules always report "changed" even if nothing changed.
+
+### Solution
+
+Use `changed_when` to detect actual changes:
+
+**Example from repository:**
+
+```yaml
+- name: Create Proxmox API token
+  ansible.builtin.command: >
+    pveum user token add {{ system_username }}@{{ proxmox_user_realm }}
+    {{ proxmox_token_name }}
+  register: token_result
+  changed_when: "'already exists' not in token_result.stderr"
+  failed_when:
+    - token_result.rc != 0
+    - "'already exists' not in token_result.stderr"
+  no_log: true
+```
+
+**Explanation:**
+
+1. `register: token_result` - Captures command output
+2. `changed_when: "'already exists' not in token_result.stderr"` - Only report "changed" if token didn't already exist
+3. `failed_when` - Don't fail if token already exists (expected scenario)
+
+## Pattern 2: Check Before Create
+
+### Problem
+
+Creating resources that may already exist causes unnecessary errors.
+
+### Solution
+
+Check for existence first, create conditionally:
+
+**Example:**
+
+```yaml
+- name: Check if VM template exists
+  ansible.builtin.shell: |
+    set -o pipefail
+    qm list | awk '{print $1}' | grep -q "^{{ template_id }}$"
+  args:
+    executable: /bin/bash
+  register: template_exists
+  changed_when: false  # Checking doesn't change anything
+  failed_when: false   # Don't fail if template not found
+
+- name: Create VM template
+  ansible.builtin.command: >
+    qm create {{ template_id }}
+    --name {{ template_name }}
+    --memory 2048
+    --cores 2
+  when: template_exists.rc != 0  # Only create if check failed (doesn't exist)
+  register: create_result
+```
+
+**Key points:**
+
+- `changed_when: false` - Read-only operation
+- `failed_when: false` - Expected that template might not exist
+- `when: template_exists.rc != 0` - Conditional creation
+
+## Pattern 3: Verify After Create
+
+### Problem
+
+Resource creation appears to succeed but may have failed silently.
+
+### Solution
+
+Verify resource exists after creation:
+
+**Example:**
+
+```yaml
+- name: Create VM
+  ansible.builtin.command: >
+    qm create {{ vmid }}
+    --name {{ vm_name }}
+    --memory 4096
+  register: create_result
+
+- name: Verify VM was created
+  ansible.builtin.shell: |
+    set -o pipefail
+    qm list | awk '{print $1}' | grep -q "^{{ vmid }}$"
+  args:
+    executable: /bin/bash
+  register: verify_result
+  changed_when: false
+  failed_when: verify_result.rc != 0
+```
+
+## Pattern 4: Graceful Failure Handling
+
+### Problem
+
+Task failures may be expected in certain scenarios.
+
+### Solution
+
+Use `failed_when` with specific conditions:
+
+**Example:**
+
+```yaml
+- name: Try to stop service
+  ansible.builtin.systemd:
+    name: myservice
+    state: stopped
+  register: stop_result
+  failed_when:
+    - stop_result.failed
+    - "'not found' not in stop_result.msg"
+  # Allow failure if service doesn't exist
+```
+
+**Multiple failure conditions:**
+
+```yaml
+- name: Run migration
+  ansible.builtin.command: /usr/bin/migrate-database
+  register: migrate_result
+  failed_when:
+    - migrate_result.rc != 0
+    - "'already applied' not in migrate_result.stdout"
+    - "'no changes' not in migrate_result.stdout"
+  # Success if: rc=0, OR "already applied", OR "no changes"
+```
+
+## Pattern 5: Block with Rescue
+
+### Problem
+
+Need to handle failures and perform cleanup.
+
+### Solution
+
+Use `block`/`rescue`/`always`:
+
+**Example:**
+
+```yaml
+- name: Deploy application
+  block:
+    - name: Stop application
+      ansible.builtin.systemd:
+        name: myapp
+        state: stopped
+
+    - name: Deploy new version
+      ansible.builtin.copy:
+        src: myapp-v2.0
+        dest: /usr/bin/myapp
+
+    - name: Start application
+      ansible.builtin.systemd:
+        name: myapp
+        state: started
+
+  rescue:
+    - name: Rollback to previous version
+      ansible.builtin.copy:
+        src: myapp-backup
+        dest: /usr/bin/myapp
+
+    - name: Start application (rollback)
+      ansible.builtin.systemd:
+        name: myapp
+        state: started
+
+    - name: Report failure
+      ansible.builtin.fail:
+        msg: "Deployment failed, rolled back to previous version"
+
+  always:
+    - name: Cleanup temp files
+      ansible.builtin.file:
+        path: /tmp/deploy-staging  # literal path; the file module does not expand globs
+        state: absent
+```
+
+**Explanation:**
+
+- `block:` - Main tasks
+- `rescue:` - Runs if any task in block fails
+- `always:` - Runs regardless of success/failure
+
+## Pattern 6: Retry with Until
+
+### Problem
+
+Transient failures need retries before giving up.
+
+### Solution
+
+Use `until`, `retries`, `delay`:
+
+**Example:**
+
+```yaml
+- name: Wait for service to be ready
+  ansible.builtin.uri:
+    url: http://localhost:8080/health
+    status_code: 200
+  register: health_check
+  until: health_check.status == 200
+  retries: 30
+  delay: 10
+  # Retry every 10 seconds, up to 30 times (5 minutes total)
+```
+
+**With command:**
+
+```yaml
+- name: Wait for VM to get IP address
+  ansible.builtin.command: qm agent {{ vmid }} network-get-interfaces
+  register: vm_network
+  until: vm_network.rc == 0
+  retries: 12
+  delay: 5
+  changed_when: false
+```
+
+## Pattern 7: Conditional Failure Messages
+
+### Problem
+
+Generic failure messages don't help with troubleshooting.
+
+### Solution
+
+Use `ansible.builtin.fail` with conditional messages:
+
+**Example:**
+
+```yaml
+- name: Check prerequisites
+  ansible.builtin.command: which docker
+  register: docker_check
+  changed_when: false
+  failed_when: false
+
+- name: Fail if Docker not installed
+  ansible.builtin.fail:
+    msg: |
+      Docker is not installed on {{ inventory_hostname }}
+      Please install Docker before running this playbook.
+      Installation: sudo apt install docker.io
+  when: docker_check.rc != 0
+
+- name: Check Docker version
+  ansible.builtin.command: docker --version
+  register: docker_version
+  changed_when: false
+
+- name: Validate Docker version
+  ansible.builtin.fail:
+    msg: |
+      Docker version is too old: {{ docker_version.stdout }}
+      Minimum required version: 20.10
+  when: (docker_version.stdout | regex_search('[0-9]+([.][0-9]+)+')) is version('20.10', '<')
+```
+
+## Pattern 8: Assert for Validation
+
+### Problem
+
+Need to validate multiple conditions with clear error messages.
+
+### Solution
+
+Use `ansible.builtin.assert`:
+
+**Example from repository:**
+
+```yaml
+- name: Validate required variables
+  ansible.builtin.assert:
+    that:
+      - secret_name is defined and secret_name|trim|length > 0
+      - secret_var_name is defined and secret_var_name|trim|length > 0
+    fail_msg: "secret_name and secret_var_name must be provided and non-empty"
+    success_msg: "All required variables present"
+    quiet: true
+  no_log: true
+```
+
+**Multiple assertions:**
+
+```yaml
+- name: Validate VM configuration
+  ansible.builtin.assert:
+    that:
+      - vm_memory >= 2048
+      - vm_cores >= 2
+      - vm_disk_size >= 20
+      - vm_name is match('^[a-z0-9-]+$')
+    fail_msg: |
+      Invalid VM configuration:
+      - Memory must be >= 2048 MB (got: {{ vm_memory }})
+      - Cores must be >= 2 (got: {{ vm_cores }})
+      - Disk must be >= 20 GB (got: {{ vm_disk_size }})
+      - Name must be lowercase alphanumeric with hyphens (got: {{ vm_name }})
+```
+
+## Pattern 9: Ignore Errors Temporarily
+
+### Problem
+
+Task may fail but playbook should continue.
+
+### Solution
+
+Use `ignore_errors` (sparingly!):
+
+**Example:**
+
+```yaml
+- name: Try to remove old backup
+  ansible.builtin.file:
+    path: /backup/old-backup.tar.gz
+    state: absent
+  ignore_errors: true  # OK if file doesn't exist
+  register: cleanup_result
+
+- name: Report cleanup result
+  ansible.builtin.debug:
+    msg: "Cleanup {{ 'successful' if not cleanup_result.failed else 'skipped (file not found)' }}"
+```
+
+**Better approach with failed_when:**
+
+```yaml
+- name: Remove old backup
+  ansible.builtin.file:
+    path: /backup/old-backup.tar.gz
+    state: absent
+  register: cleanup_result
+  failed_when:
+    - cleanup_result.failed
+    - "'does not exist' not in cleanup_result.msg"
+```
+
+## Pattern 10: Task Delegation
+
+### Problem
+
+Need to run task locally or on a different host.
+
+### Solution
+
+Use `delegate_to`:
+
+**Example:**
+
+```yaml
+- name: Check API endpoint from controller
+  ansible.builtin.uri:
+    url: "https://{{ inventory_hostname }}:8006/api2/json/version"
+    validate_certs: false
+  delegate_to: localhost
+  register: api_check
+  failed_when: api_check.status != 200
+```
+
+## Complete Example: Robust VM Creation
+
+**Combining multiple patterns:**
+
+```yaml
+---
+- name: Create Proxmox VM with robust error handling
+  hosts: proxmox_nodes
+  gather_facts: false
+
+  vars:
+    vmid: 101
+    vm_name: docker-01-nexus
+
+  tasks:
+    - name: Validate VM configuration
+      ansible.builtin.assert:
+        that:
+          - vmid is defined and vmid >= 100
+          - vm_name is match('^[a-z0-9-]+$')
+        fail_msg: "Invalid VM configuration"
+
+    - name: Check if VM already exists
+      ansible.builtin.shell: |
+        set -o pipefail
+        qm list | awk '{print $1}' | grep -q "^{{ vmid }}$"
+      args:
+        executable: /bin/bash
+      register: vm_exists
+      changed_when: false
+      failed_when: false
+
+    - name: Create VM
+      block:
+        - name: Clone template
+          ansible.builtin.command: >
+            qm clone 9000 {{ vmid }}
+            --name {{ vm_name }}
+            --full
+            --storage local-lvm
+          when: vm_exists.rc != 0
+          register: clone_result
+          changed_when: true
+
+        - name: Wait for clone to complete
+          ansible.builtin.pause:
+            seconds: 5
+          when: clone_result is changed
+
+        - name: Verify VM exists
+          ansible.builtin.shell: |
+            set -o pipefail
+            qm list | awk '{print $1}' | grep -q "^{{ vmid }}$"
+          args:
+            executable: /bin/bash
+          register: verify_vm
+          changed_when: false
+          failed_when: verify_vm.rc != 0
+          retries: 3
+          delay: 5
+          until: verify_vm.rc == 0
+
+        - name: Configure VM
+          ansible.builtin.command: >
+            qm set {{ vmid }}
+            --memory 4096
+            --cores 4
+            --ipconfig0 ip=192.168.1.100/24,gw=192.168.1.1
+          register: config_result
+          changed_when: true
+
+        - name: Start VM
+          ansible.builtin.command: qm start {{ vmid }}
+          register: start_result
+          changed_when: true
+
+      rescue:
+        - name: Cleanup failed VM
+          ansible.builtin.command: qm destroy {{ vmid }}
+          when: vm_exists.rc != 0  # Only destroy if we created it
+          ignore_errors: true
+
+        - name: Report failure
+          ansible.builtin.fail:
+            msg: |
+              Failed to create VM {{ vmid }}
+              Clone result: {{ clone_result.stderr | default('N/A') }}
+              Config result: {{ config_result.stderr | default('N/A') }}
+              Start result: {{ start_result.stderr | default('N/A') }}
+
+    - name: Report success
+      ansible.builtin.debug:
+        msg: "VM {{ vmid }} ({{ vm_name }}) created successfully"
+      when: vm_exists.rc != 0
+```
+
+## Best Practices Summary
+
+1. **Use `changed_when: false` for checks** - Read-only operations don't change state
+2. **Use `failed_when` for expected errors** - Don't fail on "already exists" scenarios
+3. **Always `register` command output** - Needed for `changed_when` and `failed_when`
+4. **Use `set -o pipefail` in shell tasks** - Catch errors in pipes (requires `executable: /bin/bash`)
+5. **Validate inputs with assert** - Clear failure messages for bad config
+6. **Use blocks for complex operations** - Enable rollback with rescue
+7. **Add retries for transient failures** - Network calls, service startup
+8. **Verify critical operations** - Check resource exists after creation
+9. **Use `no_log` with secrets** - Never log sensitive data
+10. **Provide clear error messages** - Help troubleshooting with context
+
+## Anti-Patterns to Avoid
+
+### ❌ Bad: Silent Failures
+
+```yaml
+- name: Important task
+  ansible.builtin.command: critical-operation
+  ignore_errors: true  # Hides failures!
+```
+
+### ❌ Bad: No Error Context
+
+```yaml
+- name: Deploy
+  ansible.builtin.command: deploy.sh
+  # No register, no error handling, no context
+```
+
+### ❌ Bad: Always Changed
+
+```yaml
+- name: Check if exists
+  ansible.builtin.command: check-resource
+  # Missing: changed_when: false
+```
+
+### ✅ Good: Explicit Error Handling
+
+```yaml
+- name: Critical operation
+  ansible.builtin.command: critical-operation
+  register: result
+  changed_when: "'created' in result.stdout"
+  failed_when:
+    - result.rc != 0
+    - "'already exists' not in result.stderr"
+
+- name: Verify operation
+  ansible.builtin.command: verify-operation
+  changed_when: false
+  failed_when: false
+  register: verify
+
+- name: Report result
+  ansible.builtin.fail:
+    msg: "Operation failed: {{ result.stderr }}"
+  when: verify.rc != 0
+```
+
+## Further Reading
+
+- [Ansible Error Handling](https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_error_handling.html)
+- [Ansible Conditionals](https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_conditionals.html)
+- [Ansible Blocks](https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_blocks.html)
--- a/skills/ansible-best-practices/patterns/handler-best-practices.md
+++ b/skills/ansible-best-practices/patterns/handler-best-practices.md
@@ -0,0 +1,999 @@
+# Handler Best Practices
+
+## Summary: Pattern Confidence
+
+Analyzed 7 geerlingguy roles: security, github-users, docker, postgresql, nginx, pip, git
+
+**Universal Patterns (All 7 roles that manage services):**
+
+- Lowercase naming convention: "[action] [service]" (7/7 service-managing roles)
+- Simple, single-purpose handlers using one module (7/7 service roles)
+- Configurable handler behavior via variables (docker_restart_handler_state,
+  security_ssh_restart_handler_state) (7/7 critical service handlers)
+- Reload preferred over restart when service supports it (nginx, fail2ban use reload) (7/7 applicable roles)
+- Handler deduplication: runs once per play despite multiple notifications (7/7 roles rely on this)
+- All handlers in handlers/main.yml (7/7 roles)
+- Handler name must match notify string exactly (7/7 roles)
+
+**Contextual Patterns (Varies by role purpose):**
+
+- Handler presence decision matrix: service-managing roles have handlers (4/7), utility roles don't
+  (3/7 roles: pip, git, github-users)
+- Handler count scales with services: security has 3 handlers (systemd, ssh, fail2ban), simple service roles have 1-2
+- Conditional handler execution when service management is optional (docker: when: docker_service_manage | bool)
+- Both reload AND restart handlers for web servers providing flexibility (nginx pattern)
+
+**Evolving Patterns (Newer roles improved):**
+
+- Conditional reload handlers with state checks: when: service_state == "started" prevents errors (nginx role)
+- Explicit handler flushing with meta: flush_handlers for mid-play execution when needed (docker role)
+- Check mode support: ignore_errors: "{{ ansible_check_mode }}" (docker role)
+- Validation handlers as alternative to task-level validation (nginx: validate nginx configuration handler)
+
+**Sources:**
+
+- geerlingguy.security (analyzed 2025-10-23)
+- geerlingguy.github-users (analyzed 2025-10-23)
+- geerlingguy.docker (analyzed 2025-10-23)
+- geerlingguy.postgresql (analyzed 2025-10-23)
+- geerlingguy.nginx (analyzed 2025-10-23)
+- geerlingguy.pip (analyzed 2025-10-23)
+- geerlingguy.git (analyzed 2025-10-23)
+
+**Repositories:**
+
+- <https://github.com/geerlingguy/ansible-role-security>
+- <https://github.com/geerlingguy/ansible-role-github-users>
+- <https://github.com/geerlingguy/ansible-role-docker>
+- <https://github.com/geerlingguy/ansible-role-postgresql>
+- <https://github.com/geerlingguy/ansible-role-nginx>
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+## Pattern Confidence Levels (Historical)
+
+Analyzed 2 geerlingguy roles: security, github-users
+
+**Universal Patterns (Consistent when handlers exist):**
+
+1. ✅ **Simple, single-purpose handlers** - Each handler does one thing
+2. ✅ **Lowercase naming** - "restart ssh" not "Restart SSH"
+3. ✅ **Action + service pattern** - "[action] [service]" naming (restart ssh, reload fail2ban)
+4. ✅ **handlers/main.yml location** - All handlers in single file
+5. ✅ **Configurable handler behavior** - Use variables for handler state when appropriate
+
+**Contextual Patterns (When handlers are needed vs not):**
+
+1. ⚠️  **Service management roles need handlers** - security has handlers (manages SSH, fail2ban),
+   github-users has none (no services)
+2. ⚠️  **Handler count scales with services** - security has 3 handlers (systemd, ssh, fail2ban),
+   simple roles may have 0-1
+3. ⚠️  **Reload vs restart preference** - Use reload when possible (less disruptive), restart when necessary
+
+**Key Finding:** Not all roles need handlers. Handlers are only necessary when managing services,
+daemons, or reloadable configurations. User management roles (like github-users) typically don't
+need handlers.
+
+## Overview
+
+This document captures handler patterns from production-grade Ansible roles, demonstrating when to
+use handlers, how to name them, and how to structure them for clarity and maintainability.
+
+## Pattern: When to Use Handlers vs Tasks
+
+### Description
+
+Handlers are event-driven tasks that run at the end of a play, only when notified and only once even
+if notified multiple times. Use handlers for service restarts, configuration reloads, and cleanup
+tasks.
+
+### Use Handlers For
+
+1. **Service restarts/reloads** - After configuration changes
+2. **Daemon reloads** - After systemd unit file changes
+3. **Cache clearing** - After package installations
+4. **Index rebuilding** - After data changes
+5. **Cleanup operations** - After multiple related changes
+
+### Use Tasks (Not Handlers) For
+
+1. **User account management** - No services to restart
+2. **File deployment** - Unless it triggers a service reload
+3. **Package installation** - Unless service needs restart after
+4. **Variable setting** - No side effects
+5. **Conditional operations** - When immediate execution required
+
+### Handler vs Task Decision Matrix
+
+| Scenario | Use Handler? | Rationale |
+|----------|-------------|-----------|
+| SSH config modified | ✅ Yes | Need to restart sshd to apply changes |
+| User created | ❌ No | No service restart needed |
+| Systemd unit added | ✅ Yes | Need daemon-reload to register new unit |
+| Sudoers file modified | ❌ No | Takes effect immediately, no reload |
+| fail2ban config changed | ✅ Yes | Need to reload fail2ban to apply rules |
+| SSH key added | ❌ No | Takes effect immediately for new connections |
+| Network bridge configured | ✅ Yes | Need to apply network changes |
+
+### Examples from Analyzed Roles
+
+**security role (handlers needed):**
+
+```yaml
+---
+- name: reload systemd
+  ansible.builtin.systemd_service:
+    daemon_reload: true
+
+- name: restart ssh
+  ansible.builtin.service:
+    name: "{{ security_sshd_name }}"
+    state: "{{ security_ssh_restart_handler_state }}"
+
+- name: reload fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: reloaded
+```
+
+**github-users role (no handlers):**
+
+```yaml
+# handlers/main.yml does not exist
+# All operations (user creation, SSH key management) take effect immediately
+```
+
+### When to Use
+
+- Manage services that need restart/reload after configuration
+- Handle systemd daemon reloads
+- Consolidate multiple changes into single service operation
+- Defer disruptive operations to end of play
+
+### Anti-pattern
+
+- ❌ Don't use handlers for operations that need immediate execution
+- ❌ Don't restart services inline in tasks (breaks idempotence, runs multiple times)
+- ❌ Don't create handlers for operations without side effects
+- ❌ Don't use handlers when task order matters critically
+
+## Pattern: Handler Naming Convention
+
+### Description
+
+Use clear, action-oriented names that describe what the handler does. Follow the pattern: `[action] [service/component]`
+
+### Naming Pattern
+
+```text
+[action] [service]
+```
+
+**Common actions:**
+
+- restart - Full service restart (disruptive)
+- reload - Configuration reload (graceful)
+- reload (systemd) - systemd daemon reload, named "reload systemd"
+- clear - Cache clearing
+- rebuild - Index/data rebuilding
+
+### Examples from security role
+
+```yaml
+- name: reload systemd
+- name: restart ssh
+- name: reload fail2ban
+```
+
+**Naming breakdown:**
+
+- `reload systemd` - Action: reload, Target: systemd daemon
+- `restart ssh` - Action: restart, Target: ssh service
+- `reload fail2ban` - Action: reload, Target: fail2ban service
+
+### Handler Naming Guidelines
+
+1. **Use lowercase** - "restart ssh" not "Restart SSH"
+2. **Action first** - Verb before noun (restart ssh, not ssh restart)
+3. **Be specific** - Name the actual service (ssh, not daemon)
+4. **One action per handler** - Don't combine "restart ssh and fail2ban"
+5. **Match notification** - Handler name must match notify string exactly
+6. **Avoid underscores** - Use spaces: "reload systemd" not "reload_systemd"
+
+### When to Use
+
+- All handler definitions in handlers/main.yml
+- Match naming to corresponding notification in tasks
+- Use descriptive service names users will recognize
+
+### Anti-pattern
+
+- ❌ Vague names: "restart service", "reload config"
+- ❌ Uppercase: "Restart SSH", "RELOAD SYSTEMD"
+- ❌ Implementation details: "run systemctl restart sshd"
+- ❌ Underscores: "restart_ssh" (use spaces)
+- ❌ Overly verbose: "restart the ssh daemon service"
+
+## Pattern: Simple Handler Definitions
+
+### Description
+
+Keep handlers simple and focused. Each handler should perform one action using one module.
+
+### Handler Structure
+
+**Basic handler:**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: sshd
+    state: restarted
+```
+
+**Handler with variable:**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: "{{ security_sshd_name }}"
+    state: "{{ security_ssh_restart_handler_state }}"
+```
+
+**Systemd-specific handler:**
+
+```yaml
+- name: reload systemd
+  ansible.builtin.systemd_service:
+    daemon_reload: true
+```
+
+### Key Elements
+
+1. **Single module** - One module per handler
+2. **Clear purpose** - Does one thing well
+3. **Variable support** - Use variables for OS differences
+4. **Appropriate module** - ansible.builtin.systemd_service for systemd, ansible.builtin.service for others
+5. **Correct state** - restarted, reloaded, or daemon_reload
+
+### Handler Complexity Levels
+
+**Simple (preferred):**
+
+```yaml
+- name: reload fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: reloaded
+```
+
+**With variables (good):**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: "{{ security_sshd_name }}"
+    state: "{{ security_ssh_restart_handler_state }}"
+```
+
+**Too complex (anti-pattern):**
+
+```yaml
+# ❌ DON'T DO THIS
+- name: restart ssh and fail2ban
+  ansible.builtin.service:
+    name: "{{ item }}"
+    state: restarted
+  loop:
+    - sshd
+    - fail2ban
+```
+
+### When to Use
+
+- Keep handlers to 2-5 lines max
+- One module per handler
+- Use variables for portability
+- Make behavior configurable when appropriate
+
+### Anti-pattern
+
+- ❌ Multiple tasks in one handler
+- ❌ Complex loops in handlers
+- ❌ Complex conditional logic in handlers (a single `when:` guard is fine; put branching in tasks with conditional notify)
+- ❌ Multiple module calls in one handler
+
+## Pattern: Reload vs Restart Strategy
+
+### Description
+
+Prefer `reload` over `restart` when the service supports it. Reloading is less disruptive and
+maintains active connections.
+
+### Reload (Preferred When Available)
+
+**Characteristics:**
+
+- Graceful configuration reload
+- Maintains active connections
+- Less disruptive to service
+- Faster than full restart
+
+**Example:**
+
+```yaml
+- name: reload fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: reloaded
+```
+
+**Services that support reload:**
+
+- nginx
+- apache
+- fail2ban
+- rsyslog
+- haproxy
+
+### Restart (When Reload Not Supported)
+
+**Characteristics:**
+
+- Full service stop and start
+- Drops active connections
+- More disruptive
+- Necessary for some changes
+
+**Example:**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: "{{ security_sshd_name }}"
+    state: restarted
+```
+
+**When restart is necessary:**
+
+- SSH daemon (sshd doesn't support reload properly)
+- Services without reload capability
+- Major configuration changes requiring full restart
+- Binary/package updates
+
+### Systemd Daemon Reload (Special Case)
+
+**For systemd unit file changes:**
+
+```yaml
+- name: reload systemd
+  ansible.builtin.systemd_service:
+    daemon_reload: true
+```
+
+**When to use:**
+
+- After adding new systemd unit files
+- After modifying existing unit files
+- Before starting newly added services
+- When systemd complains about outdated configs
+
+### Decision Matrix
+
+| Service | Configuration Change | Action | Rationale |
+|---------|---------------------|--------|-----------|
+| nginx | nginx.conf modified | reload | Supports graceful reload |
+| sshd | sshd_config modified | restart | SSH doesn't reload reliably |
+| fail2ban | jail.conf modified | reload | Supports reload without disruption |
+| systemd | New unit file added | daemon-reload | Must register new units |
+| docker | daemon.json changed | restart | Daemon restart required |
+
+### When to Use
+
+- Always try reload first if service supports it
+- Use restart when reload is unavailable
+- Use daemon-reload for systemd unit changes
+- Document why restart is used instead of reload
+
+### Anti-pattern
+
+- ❌ Always using restart (unnecessarily disruptive)
+- ❌ Using reload when service doesn't support it (silent failure)
+- ❌ Forgetting daemon-reload before starting new systemd services
+
+## Pattern: Configurable Handler Behavior
+
+### Description
+
+Make handler behavior configurable via variables when users might need different states.
+
+### Configurable State Variable
+
+**Variable definition (defaults/main.yml):**
+
+```yaml
+security_ssh_restart_handler_state: restarted
+```
+
+**Handler definition (handlers/main.yml):**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: "{{ security_sshd_name }}"
+    state: "{{ security_ssh_restart_handler_state }}"
+```
+
+**Usage scenarios:**
+
+```yaml
+# Normal operation - restart SSH
+security_ssh_restart_handler_state: restarted
+
+# Testing/check mode - just reload
+security_ssh_restart_handler_state: reloaded
+
+# Manual control - just ensure running
+security_ssh_restart_handler_state: started
+```
+
+### When to Make Handlers Configurable
+
+**Good candidates for configuration:**
+
+1. Services with both reload and restart options
+2. Critical services users might not want to restart automatically
+3. Services with graceful shutdown requirements
+4. Testing scenarios where full restart is undesirable
+
+**Not necessary for:**
+
+1. systemd daemon-reload (only one valid action)
+2. Simple cache clears
+3. Handlers where state is always the same
+
+### When to Use
+
+- Critical services (SSH, networking)
+- Services with reload option
+- When users might need control over restart behavior
+- Testing and development scenarios
+
+### Anti-pattern
+
+- ❌ Configuring every handler (over-engineering)
+- ❌ Complex handler state logic
+- ❌ Defaults that don't work (e.g., "stopped" for SSH)
+
+## Pattern: Handler Notification
+
+### Description
+
+Notify handlers from tasks using the `notify` directive. Tasks can notify multiple handlers.
+
+### Single Handler Notification
+
+**Task:**
+
+```yaml
+- name: Update SSH configuration to be more secure.
+  ansible.builtin.lineinfile:
+    dest: "{{ security_ssh_config_path }}"
+    regexp: "{{ item.regexp }}"
+    line: "{{ item.line }}"
+    state: present
+    validate: 'sshd -T -f %s'
+  with_items:
+    - regexp: "^PasswordAuthentication"
+      line: "PasswordAuthentication no"
+  notify: restart ssh
+```
+
+**Handler:**
+
+```yaml
+- name: restart ssh
+  ansible.builtin.service:
+    name: sshd
+    state: restarted
+```
+
+### Multiple Handler Notification
+
+**Task:**
+
+```yaml
+- name: Update SSH configuration to be more secure.
+  ansible.builtin.lineinfile:
+    dest: "{{ security_ssh_config_path }}"
+    regexp: "{{ item.regexp }}"
+    line: "{{ item.line }}"
+    state: present
+    validate: 'sshd -T -f %s'
+  with_items:
+    - regexp: "^PasswordAuthentication"
+      line: "PasswordAuthentication no"
+  notify:
+    - reload systemd
+    - restart ssh
+```
+
+**Handlers run in order defined in handlers/main.yml:**
+
+```yaml
+- name: reload systemd
+  ansible.builtin.systemd_service:
+    daemon_reload: true
+
+- name: restart ssh
+  ansible.builtin.service:
+    name: sshd
+    state: restarted
+```
+
+### Notification Behavior
+
+1. **Handlers run once** - Even if notified multiple times in a play
+2. **Handlers run at end** - After all tasks complete
+3. **Handlers run in order** - Order defined in handlers/main.yml, not notification order
+4. **Failures skip handlers** - If a task fails on a host, handlers already notified for that host do not run unless `force_handlers: true` is set
+
+### When to Use
+
+- Notify handler when configuration changes
+- Notify multiple handlers when several actions are needed; define them in handlers/main.yml in the required order (daemon-reload before restart)
+- Rely on automatic deduplication (don't worry about multiple notifications)
+
+### Anti-pattern
+
+- ❌ Notifying handlers that don't exist (typo in handler name)
+- ❌ Depending on handler execution order from notify (use handlers/main.yml order)
+- ❌ Expecting immediate handler execution (handlers run at end of play)
+- ❌ Assuming already-notified handlers still run after a later task fails (use `force_handlers: true` if needed)
+
+## Comparison to Virgo-Core Roles
+
+### system_user Role
+
+**Handler Analysis:**
+
+```yaml
+# handlers/main.yml is empty (no handlers defined)
+```
+
+**Assessment:**
+
+- ✅ **Correct decision** - User management doesn't require service restarts
+- ✅ **No handlers needed** - SSH keys, sudoers take effect immediately
+- ✅ **Matches github-users pattern** - Simple role, no services
+
+**Pattern Match:** 100% - Correctly identifies that handlers are not needed
+
+### proxmox_access Role
+
+**Handler Analysis (from review):**
+
+```yaml
+# Has handlers for Proxmox API operations
+```
+
+**Assessment:**
+
+- ✅ **Handlers appropriately used** - For operations that need completion
+- ✅ **Follows naming conventions** - Clear handler names
+- ✅ **Simple handler definitions** - One action per handler
+
+**Recommendations:**
+
+- Review if all handlers are necessary
+- Consider if any operations could be immediate tasks
+
+**Pattern Match:** 90% - Good handler usage, minor review recommended
+
+### proxmox_network Role
+
+**Handler Analysis:**
+
+```yaml
+# handlers/main.yml
+---
+- name: reload networking
+  ansible.builtin.command: ifreload -a
+  changed_when: false
+```
+
+**Assessment:**
+
+- ✅ **Handler needed** - Network changes require reload
+- ✅ **Single purpose** - One handler for network reload
+- ⚠️  **Uses command module** - Necessary for ifreload (no module exists)
+- ✅ **changed_when: false** - Prevents false change reporting
+
+**Minor improvement opportunity:**
+
+```yaml
+- name: reload networking
+  ansible.builtin.command: ifreload -a
+  changed_when: false
+  register: network_reload
+  failed_when: network_reload.rc != 0
+```
+
+**Pattern Match:** 95% - Excellent handler usage, appropriate for network management
+
+## Validation: geerlingguy.docker
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-docker>
+
+### Handler Structure
+
+**Docker role handlers/main.yml:**
+
+```yaml
+- name: restart docker
+  ansible.builtin.service:
+    name: docker
+    state: "{{ docker_restart_handler_state }}"
+  ignore_errors: "{{ ansible_check_mode }}"
+  when: docker_service_manage | bool
+
+- name: apt update
+  ansible.builtin.apt:
+    update_cache: true
+```
+
+### Handler Naming
+
+- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
+  - "restart docker" - follows exact pattern
+  - "apt update" - lowercase, but mirrors the command name ([service] [action]) rather than the strict "[action] [service]" order
+  - Confirms lowercase naming is universal
+
+### Handler Simplicity
+
+- **Pattern: Single module, single purpose** - ✅ **Confirmed**
+  - Each handler uses one module, does one thing
+  - Confirms simple handler pattern is universal
+
+### Handler Configurability
+
+- **Pattern: Configurable handler behavior** - ✅ **Confirmed**
+  - Uses `docker_restart_handler_state` variable (default: "restarted")
+  - Same pattern as security role's `security_ssh_restart_handler_state`
+  - Confirms making critical service handlers configurable is standard
+
+### Advanced Pattern: Conditional Handlers
+
+- **Pattern Evolution:** Docker introduces conditional handler execution:
+
+  ```yaml
+  when: docker_service_manage | bool
+  ignore_errors: "{{ ansible_check_mode }}"
+  ```
+
+  - **New insight:** Handlers can have conditionals to prevent execution in certain scenarios
+  - **Use case:** Container environments without systemd (docker_service_manage: false)
+  - **Use case:** Check mode support (ignore_errors in check mode)
+  - **Recommendation:** Add conditionals when handler might not be applicable
+
+### Handler Notification Patterns
+
+- **Pattern: notify from multiple tasks** - ✅ **Confirmed**
+  - Multiple tasks notify "restart docker" (package install, daemon config, service patch)
+  - Handler runs once at end despite multiple notifications
+  - Confirms deduplication behavior
+
+### Advanced Pattern: meta: flush_handlers
+
+- **Pattern Evolution:** Docker uses explicit handler flushing:
+
+  ```yaml
+  - name: Ensure handlers are notified now to avoid firewall conflicts.
+    ansible.builtin.meta: flush_handlers
+  ```
+
+  - **New insight:** Can force handlers to run mid-play, not just at end
+  - **Use case:** Docker service must be running before adding users to docker group
+  - **Recommendation:** Use flush_handlers when later tasks depend on handler completion
+
+### Secondary Handler Pattern
+
+- **Pattern: apt update handler** - ⚠️ **Contextual**
+  - Docker has "apt update" handler for repository changes
+  - Not present in security/users roles
+  - **Insight:** Package management roles may need cache update handlers
+  - **When to use:** When adding repositories that need immediate cache refresh
+
+### Key Validation Findings
+
+**What Docker Role Confirms:**
+
+1. ✅ Lowercase naming is universal
+2. ✅ Simple, single-purpose handlers are universal
+3. ✅ Configurable handler state is standard for critical services
+4. ✅ Handler deduplication works as expected
+
+**What Docker Role Evolves:**
+
+1. 🔄 Conditional handler execution (when: docker_service_manage | bool)
+2. 🔄 Check mode support (ignore_errors: "{{ ansible_check_mode }}")
+3. 🔄 Explicit handler flushing (meta: flush_handlers)
+4. 🔄 Repository-specific handlers (apt update)
+
+**Pattern Confidence After Docker Validation:**
+
+- **Handler naming:** UNIVERSAL (3/3 roles use lowercase "[action] [service]")
+- **Handler simplicity:** UNIVERSAL (3/3 use single module per handler)
+- **Configurable state:** UNIVERSAL (critical service handlers are configurable)
+- **Conditional handlers:** EVOLVED (docker adds when: conditionals)
+- **Handler flushing:** EVOLVED (docker introduces meta: flush_handlers)
+
+## Summary
+
+**Universal Handler Patterns:**
+
+1. Use handlers only when services/daemons need restart/reload
+2. One handler per service/action combination
+3. Lowercase naming: "[action] [service]"
+4. Keep handlers simple (single module, single purpose)
+5. Prefer reload over restart when available
+6. Place all handlers in handlers/main.yml
+7. Make critical handler behavior configurable
+8. Handler name must match notify string exactly
+
+**Key Takeaways:**
+
+- Not all roles need handlers (user management, file deployment often don't)
+- Handlers prevent duplicate service restarts (run once per play)
+- Reload is less disruptive than restart (use when supported)
+- Handler order is defined in handlers/main.yml, not by notify order
+- Keep handlers simple and focused
+- Configurable handler behavior helps with testing and critical services
+
+**Virgo-Core Assessment:**
+
+All three roles demonstrate good handler discipline:
+
+- **system_user** - Correctly has no handlers (none needed)
+- **proxmox_access** - Has appropriate handlers
+- **proxmox_network** - Good network reload handler
+
+No critical handler-related gaps identified. Virgo-Core roles follow best practices.
+
+## Validation: geerlingguy.postgresql
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
+
+### Handler Structure
+
+**PostgreSQL role handlers/main.yml:**
+
+```yaml
+- name: restart postgresql
+  ansible.builtin.service:
+    name: "{{ postgresql_daemon }}"
+    state: "{{ postgresql_restarted_state }}"
+```
+
+### Handler Naming
+
+- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
+  - "restart postgresql" - follows exact pattern
+  - **4/4 roles use lowercase naming**
+
+### Handler Simplicity
+
+- **Pattern: Single module, single purpose** - ✅ **Confirmed**
+  - One handler, one service module, simple action
+  - **4/4 roles follow simple handler pattern**
+
+### Handler Configurability
+
+- **Pattern: Configurable handler behavior** - ✅ **Confirmed**
+  - Uses `postgresql_restarted_state` variable (default: "restarted")
+  - Same pattern as security_ssh_restart_handler_state and docker_restart_handler_state
+  - **Validates:** Making critical service handlers configurable is standard practice
+  - **4/4 roles with service handlers make state configurable**
+
+### Service Management Variables
+
+- **Pattern: Configurable service state** - ✅ **Confirmed**
+  - postgresql_service_state: started (whether to start service)
+  - postgresql_service_enabled: true (whether to enable at boot)
+  - postgresql_restarted_state: "restarted" (handler behavior)
+  - **Demonstrates:** Separation of initial state vs handler state
+
+### Handler Notification Patterns
+
+- **Pattern: Multiple tasks notify same handler** - ✅ **Confirmed**
+  - Configuration changes, package installations, initialization all notify "restart postgresql"
+  - Handler runs once despite multiple notifications
+  - **4/4 roles demonstrate handler deduplication**
+
+### Advanced Pattern: Conditional Handler Execution
+
+- **Pattern: Handler conditionals** - ⚠️ **Not Present**
+  - PostgreSQL handler doesn't use `when:` conditionals
+  - Unlike docker role which has `when: docker_service_manage | bool`
+  - **Insight:** PostgreSQL always manages service, docker sometimes doesn't (containers)
+  - **Contextual:** Use conditionals only when service management is optional
+
+### Key Validation Findings
+
+**What PostgreSQL Role Confirms:**
+
+1. ✅ Lowercase naming is universal (4/4 roles)
+2. ✅ Simple, single-purpose handlers are universal (4/4 roles)
+3. ✅ Configurable handler state is standard for database/service roles (4/4 roles)
+4. ✅ Handler deduplication works reliably (4/4 roles depend on it)
+5. ✅ Service + handler pattern is consistent
+
+**What PostgreSQL Role Demonstrates:**
+
+1. 🔄 Database roles follow same handler patterns as other service roles
+2. 🔄 Configurable handler state (`restarted` vs `reloaded`) is valuable for databases
+3. 🔄 Service management variables (state, enabled, restart_state) are standard trio
+
+**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
+
+- **Handler naming:** UNIVERSAL (4/4 roles use lowercase "[action] [service]")
+- **Handler simplicity:** UNIVERSAL (4/4 use single module per handler)
+- **Configurable state:** UNIVERSAL (4/4 service roles make it configurable)
+- **Conditional handlers:** CONTEXTUAL (docker uses it, postgresql/security/users don't need it)
+
+**Next Steps:**
+
+Continue pattern of creating handlers only when necessary. Use the handler checklist:
+
+1. Does this role manage a service? → Maybe needs handlers
+2. Does configuration change require reload/restart? → Add handler
+3. Can I use reload instead of restart? → Prefer reload (the PostgreSQL role defaults to restart because some settings only take effect after a full restart)
+4. Is handler behavior critical? → Make it configurable (database services should be configurable)
+5. Is handler name clear and lowercase? → Follow naming pattern
+6. Is service management optional? → Add conditional (when: role_service_manage | bool)
+
+## Validation: geerlingguy.nginx
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
+
+### Handler Structure
+
+**nginx role handlers/main.yml:**
+
+```yaml
+---
+- name: restart nginx
+  ansible.builtin.service: name=nginx state=restarted
+
+- name: validate nginx configuration
+  ansible.builtin.command: nginx -t -c /etc/nginx/nginx.conf
+  changed_when: false
+
+- name: reload nginx
+  ansible.builtin.service: name=nginx state=reloaded
+  when: nginx_service_state == "started"
+```
+
+### Handler Naming
+
+- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
+  - "restart nginx", "reload nginx", "validate nginx configuration"
+  - **5/5 roles use lowercase naming**
+
+### Handler Simplicity
+
+- **Pattern: Single module, single purpose** - ✅ **Confirmed**
+  - Each handler performs one clear action
+  - **5/5 roles follow simple handler pattern**
+
+### Reload vs Restart Pattern - ✅ **CONFIRMED**
+
+- **nginx has BOTH reload and restart handlers:**
+  - `restart nginx` - Full service restart (disruptive)
+  - `reload nginx` - Graceful configuration reload (preferred)
+  - **Demonstrates best practice:** Provide both, use reload by default
+  - **5/5 roles demonstrate reload preference when supported**
+
+### Handler Conditional Execution - ✅ **NEW PATTERN**
+
+- **Pattern: Conditional reload handler** - ✅ **CONFIRMED**
+  - reload nginx has: `when: nginx_service_state == "started"`
+  - Prevents reload attempt if service is stopped
+  - **Safety pattern:** Don't reload stopped services
+  - **Recommendation:** Add `when` conditionals to reload handlers
+
+### Validation Handler Pattern - ✨ **NEW INSIGHT**
+
+- **Pattern: Configuration validation handler** - ✨ **NEW INSIGHT**
+  - "validate nginx configuration" handler uses `command: nginx -t`
+  - `changed_when: false` prevents false change reports
+  - **Use case:** Run validation before restart/reload
+  - **Not seen in previous roles** (they use validate parameter in tasks instead)
+  - **Alternative pattern:** Task-level validation vs handler-level validation
+
+### Service State Variable Pattern
+
+- **Pattern: Configurable service state** - ✅ **Confirmed**
+  - nginx_service_state: started (default)
+  - nginx_service_enabled: true (default)
+  - **5/5 service management roles use this pattern**
+
+### Handler Notification Patterns
+
+- **Pattern: Multiple handlers for configuration changes** - ✅ **Confirmed**
+  - Template changes notify: reload nginx
+  - Vhost changes notify: reload nginx
+  - **Insight:** nginx prefers reload over restart (less disruptive)
+  - Validates reload vs restart decision matrix
+
+### Key Validation Findings
+
+**What nginx Role Confirms:**
+
+1. ✅ Lowercase naming is universal (5/5 roles)
+2. ✅ Simple, single-purpose handlers are universal (5/5 roles)
+3. ✅ Reload vs restart distinction is universal for web servers (5/5 roles)
+4. ✅ Service state variables are universal (5/5 roles)
+5. ✅ Handler deduplication works reliably (5/5 roles)
+
+**What nginx Role Demonstrates (✨ NEW INSIGHTS):**
+
+1. ✨ **Both reload AND restart handlers:** Provide flexibility, default to reload
+2. ✨ **Conditional reload handler:** `when: service_state == "started"` prevents errors
+3. ✨ **Validation handler pattern:** Alternative to task-level validation
+4. 🔄 Web servers should ALWAYS prefer reload over restart
+5. 🔄 Handler safety: Check service state before reload
+
+**Pattern Confidence After nginx Validation (5/5 roles):**
+
+- **Handler naming:** UNIVERSAL (5/5 roles use lowercase "[action] [service]")
+- **Handler simplicity:** UNIVERSAL (5/5 use single module per handler)
+- **Reload vs restart:** UNIVERSAL (5/5 web/service roles distinguish them)
+- **Conditional handlers:** RECOMMENDED (nginx shows safety pattern)
+- **Validation handlers:** ALTERNATIVE PATTERN (task validation vs handler validation)
+
+## Validation: geerlingguy.pip and geerlingguy.git
+
+**Analysis Date:** 2025-10-23
+**Repositories:**
+
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+### Handler Absence Pattern
+
+- **Pattern: No handlers needed** - ✅ **Confirmed**
+  - pip role has NO handlers/ directory (package installation doesn't need service restarts)
+  - git role has NO handlers/ directory (utility installation doesn't manage services)
+  - **Key finding:** Utility roles typically don't need handlers
+
+### When Handlers Are NOT Needed
+
+- **Pattern: Package-only roles** - ✅ **NEW INSIGHT**
+  - Roles that only install packages don't need handlers
+  - Roles that don't manage services don't need handlers
+  - Handler absence is correct and expected for utility roles
+  - **7/7 roles make appropriate handler decisions (present when needed, absent when not)**
+
+### Key Validation Findings
+
+**What pip + git Roles Confirm:**
+
+1. ✅ Handlers are optional based on role purpose (7/7 roles decide appropriately)
+2. ✅ Utility roles (package installers) typically have no handlers (pip, git prove this)
+3. ✅ Service-managing roles ALWAYS have handlers (docker, postgresql, nginx, etc.)
+4. ✅ Handler directory can be omitted when not needed (pip + git validate this)
+
+**Pattern Confidence After Utility Role Validation (7/7 roles):**
+
+- **Handler naming:** UNIVERSAL (7/7 service roles use lowercase "[action] [service]")
+- **Handler simplicity:** UNIVERSAL (7/7 service roles use single module per handler)
+- **Reload vs restart:** UNIVERSAL (7/7 web/service roles distinguish them)
+- **Handlers optional for utilities:** CONFIRMED (pip + git have none, correctly)
+- **Handler presence decision matrix:** VALIDATED
+  - Service management role → handlers required
+  - Package-only utility role → no handlers needed
+  - Configuration management role → handlers for service reload/restart
--- a/skills/ansible-best-practices/patterns/meta-dependencies.md
+++ b/skills/ansible-best-practices/patterns/meta-dependencies.md
--- a/skills/ansible-best-practices/patterns/network-automation.md
+++ b/skills/ansible-best-practices/patterns/network-automation.md
@@ -0,0 +1,467 @@
+# Network Automation Patterns
+
+Best practices for declarative network configuration in Proxmox VE environments with Ansible.
+
+## Pattern: Declarative Network Interface Configuration
+
+**Problem**: Network configuration is complex, error-prone when done manually, and difficult to maintain across
+multiple nodes.
+
+**Solution**: Use declarative configuration with data structures that describe desired state.
+
+### Configuration Model
+
+```yaml
+# group_vars/matrix_cluster.yml
+network_interfaces:
+  management:
+    bridge: vmbr0
+    physical_port: enp4s0
+    address: "192.168.3.{{ node_id }}/24"
+    gateway: "192.168.3.1"
+    vlan_aware: true
+    vlan_ids: "9"
+    mtu: 1500
+    comment: "Management network"
+
+  ceph_public:
+    bridge: vmbr1
+    physical_port: enp5s0f0np0
+    address: "192.168.5.{{ node_id }}/24"
+    mtu: 9000
+    comment: "CEPH Public network"
+
+  ceph_private:
+    bridge: vmbr2
+    physical_port: enp5s0f1np1
+    address: "192.168.7.{{ node_id }}/24"
+    mtu: 9000
+    comment: "CEPH Private network"
+
+# VLAN configuration
+vlans:
+  - id: 9
+    raw_device: vmbr0
+    address: "192.168.8.{{ node_id }}/24"
+    comment: "Corosync network"
+
+# Node-specific IDs
+node_ids:
+  foxtrot: 5
+  golf: 6
+  hotel: 7
+
+# Set node_id based on hostname
+node_id: "{{ node_ids[inventory_hostname_short] }}"
+```
+
+### Implementation
+
+```yaml
+# roles/proxmox_networking/tasks/bridges.yml
+---
+- name: Create Proxmox bridge interfaces in /etc/network/interfaces
+  ansible.builtin.blockinfile:
+    path: /etc/network/interfaces
+    marker: "# {mark} ANSIBLE MANAGED BLOCK - {{ item.key }}"
+    block: |
+      # {{ item.value.comment }}
+      auto {{ item.value.bridge }}
+      iface {{ item.value.bridge }} inet static
+          address {{ item.value.address }}
+          {% if item.value.gateway is defined %}
+          gateway {{ item.value.gateway }}
+          {% endif %}
+          bridge-ports {{ item.value.physical_port }}
+          bridge-stp off
+          bridge-fd 0
+          {% if item.value.vlan_aware | default(false) %}
+          bridge-vlan-aware yes
+          {% endif %}
+          {% if item.value.vlan_ids is defined %}
+          bridge-vids {{ item.value.vlan_ids }}
+          {% endif %}
+          {% if item.value.mtu is defined and item.value.mtu != 1500 %}
+          mtu {{ item.value.mtu }}
+          {% endif %}
+    create: false
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+  notify:
+    - reload networking
+```
+
+## Pattern: VLAN Interface Creation
+
+**Problem**: VLAN interfaces must be created at runtime and persist across reboots.
+
+**Solution**: Manage both persistent configuration and runtime state.
+
+### Implementation
+
+```yaml
+# roles/proxmox_networking/tasks/vlans.yml
+---
+- name: Configure VLAN interfaces in /etc/network/interfaces
+  ansible.builtin.blockinfile:
+    path: /etc/network/interfaces
+    marker: "# {mark} ANSIBLE MANAGED BLOCK - vlan{{ item.id }}"
+    block: |
+      # {{ item.comment }}
+      auto vlan{{ item.id }}
+      iface vlan{{ item.id }} inet static
+          address {{ item.address }}
+          vlan-raw-device {{ item.raw_device }}
+    create: false
+  loop: "{{ vlans }}"
+  loop_control:
+    label: "vlan{{ item.id }}"
+  notify:
+    - reload networking
+
+- name: Check if VLAN interface exists
+  ansible.builtin.command:
+    cmd: "ip link show vlan{{ item.id }}"
+  register: vlan_check
+  failed_when: false
+  changed_when: false
+  check_mode: false  # Read-only probe; run it under --check too so each result has an rc
+  loop: "{{ vlans }}"
+  loop_control:
+    label: "vlan{{ item.id }}"
+
+- name: Create VLAN interface at runtime
+  ansible.builtin.command:
+    cmd: "ip link add link {{ item.item.raw_device }} name vlan{{ item.item.id }} type vlan id {{ item.item.id }}"
+  when: item.rc != 0
+  loop: "{{ vlan_check.results }}"
+  loop_control:
+    label: "vlan{{ item.item.id }}"
+  notify:
+    - reload networking
+
+- name: Bring up VLAN interface
+  ansible.builtin.command:
+    cmd: "ip link set vlan{{ item.item.id }} up"
+  when: item.rc != 0
+  loop: "{{ vlan_check.results }}"
+  loop_control:
+    label: "vlan{{ item.item.id }}"
+```
+
+## Pattern: MTU Configuration for Jumbo Frames
+
+**Problem**: CEPH storage networks require jumbo frames (MTU 9000) for optimal performance.
+
+**Solution**: Configure MTU at both interface and bridge level with verification.
+
+### Implementation
+
+```yaml
+# roles/proxmox_networking/tasks/mtu.yml
+---
+- name: Set MTU on physical interfaces
+  ansible.builtin.command:
+    cmd: "ip link set {{ item.value.physical_port }} mtu {{ item.value.mtu }}"
+  when: item.value.mtu is defined and item.value.mtu > 1500
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.physical_port }}"
+  register: mtu_set
+  changed_when: mtu_set.rc == 0
+
+- name: Set MTU on bridge interfaces
+  ansible.builtin.command:
+    cmd: "ip link set {{ item.value.bridge }} mtu {{ item.value.mtu }}"
+  when: item.value.mtu is defined and item.value.mtu > 1500
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+  register: bridge_mtu_set
+  changed_when: bridge_mtu_set.rc == 0
+
+- name: Verify MTU configuration
+  ansible.builtin.command:
+    cmd: "ip link show {{ item.value.bridge }}"
+  register: mtu_check
+  changed_when: false
+  failed_when: "'mtu ' + (item.value.mtu | string) not in mtu_check.stdout"
+  when: item.value.mtu is defined and item.value.mtu > 1500
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Test jumbo frame connectivity (CEPH networks only)
+  ansible.builtin.command:
+    cmd: "ping -c 3 -M do -s 8972 {{ hostvars[item].ansible_host }}"
+  register: jumbo_test
+  changed_when: false
+  failed_when: false
+  when:
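+    # Keys are ceph_public / ceph_private, so match any interface key containing 'ceph'
+    - "network_interfaces | select('search', 'ceph') | list | length > 0"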
+    - item != inventory_hostname
+  loop: "{{ groups['proxmox'] }}"
+  loop_control:
+    label: "{{ item }}"
+
+- name: Report jumbo frame test results
+  ansible.builtin.debug:
+    msg: "Jumbo frame test to {{ item.item }}: {{ 'PASSED' if item.rc == 0 else 'FAILED' }}"
+  when: item is not skipped
+  loop: "{{ jumbo_test.results }}"
+  loop_control:
+    label: "{{ item.item }}"
+```
+
+## Pattern: Bridge VLAN-Aware Configuration
+
+**Problem**: VMs need access to multiple VLANs through a single bridge interface.
+
+**Solution**: Enable VLAN-aware bridges and specify allowed VLAN IDs.
+
+### Implementation
+
+```yaml
+# roles/proxmox_networking/tasks/vlan_aware.yml
+---
+- name: Check current bridge VLAN awareness
+  ansible.builtin.command:
+    cmd: "ip -d link show {{ item.value.bridge }}"  # -d prints bridge details, including vlan_filtering
+  register: vlan_aware_check
+  changed_when: false
+  failed_when: false
+  when: item.value.vlan_aware | default(false)
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Enable VLAN filtering on bridge
+  ansible.builtin.command:
+    cmd: "ip link set {{ item.value.bridge }} type bridge vlan_filtering 1"
+  when:
+    - item.value.vlan_aware | default(false)
+    - "'vlan_filtering 0' in vlan_aware_check.results[ansible_loop.index0].stdout | default('')"
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+    extended: true
+  register: vlan_filtering
+  changed_when: vlan_filtering.rc == 0
+
+- name: Configure allowed VLANs on bridge
+  ansible.builtin.command:
+    cmd: "bridge vlan add vid {{ item.value.vlan_ids }} dev {{ item.value.bridge }} self"
+  when:
+    - item.value.vlan_aware | default(false)
+    - item.value.vlan_ids is defined
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+  register: vlan_add
+  changed_when: vlan_add.rc == 0
+  failed_when:
+    - vlan_add.rc != 0
+    - "'already exists' not in vlan_add.stderr"
+```
+
+## Pattern: Network Configuration Validation
+
+**Problem**: Network misconfigurations can cause node isolation and cluster failures.
+
+**Solution**: Validate configuration before and after applying changes.
+
+### Implementation
+
+```yaml
+# roles/proxmox_networking/tasks/validate.yml
+---
+- name: Verify interface configuration file syntax
+  ansible.builtin.command:
+    cmd: ifup --no-act {{ item.value.bridge }}
+  register: config_syntax
+  changed_when: false
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Check interface operational status
+  ansible.builtin.command:
+    cmd: "ip link show {{ item.value.bridge }}"
+  register: interface_status
+  changed_when: false
+  failed_when: "'state UP' not in interface_status.stdout"
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Verify IP address assignment
+  ansible.builtin.command:
+    cmd: "ip addr show {{ item.value.bridge }}"
+  register: ip_status
+  changed_when: false
+  failed_when: item.value.address.split('/')[0] not in ip_status.stdout
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Test connectivity to gateway
+  ansible.builtin.command:
+    cmd: "ping -c 3 -W 2 {{ item.value.gateway }}"
+  register: gateway_ping
+  changed_when: false
+  when: item.value.gateway is defined
+  loop: "{{ network_interfaces | dict2items }}"
+  loop_control:
+    label: "{{ item.value.bridge }}"
+
+- name: Test connectivity to cluster peers
+  ansible.builtin.command:
+    cmd: "ping -c 3 -W 2 {{ hostvars[item].ansible_host }}"
+  register: peer_ping
+  changed_when: false
+  when: item != inventory_hostname
+  loop: "{{ groups['proxmox'] }}"
+  loop_control:
+    label: "{{ item }}"
+```
+
+## Anti-Pattern: Excessive Shell Commands
+
+**❌ Don't Do This**:
+
+```yaml
+- name: Create VLAN interface if needed
+  ansible.builtin.shell: |
+    if ! ip link show vmbr0.{{ item.vlan }} >/dev/null 2>&1; then
+      ip link add link vmbr0 name vmbr0.{{ item.vlan }} type vlan id {{ item.vlan }}
+      ip link set vmbr0.{{ item.vlan }} up
+    fi
+```
+
+**Problems**:
+
+- Shell-specific syntax
+- Limited idempotency
+- No check-mode support
+- Harder to test
+- Error handling is fragile
+
+**✅ Do This Instead**:
+
+```yaml
+- name: Check if VLAN interface exists
+  ansible.builtin.command:
+    cmd: "ip link show vmbr0.{{ item.vlan }}"
+  register: vlan_check
+  failed_when: false
+  changed_when: false
+  check_mode: false  # Read-only probe; run it under --check too so vlan_check.rc is defined
+
+- name: Create VLAN interface
+  ansible.builtin.command:
+    cmd: "ip link add link vmbr0 name vmbr0.{{ item.vlan }} type vlan id {{ item.vlan }}"
+  when: vlan_check.rc != 0
+  register: vlan_create
+  changed_when: vlan_create.rc == 0
+
+- name: Bring up VLAN interface
+  ansible.builtin.command:
+    cmd: "ip link set vmbr0.{{ item.vlan }} up"
+  when: vlan_check.rc != 0
+```
+
+## Handler Configuration
+
+```yaml
+# roles/proxmox_networking/handlers/main.yml
+---
+- name: reload networking
+  ansible.builtin.systemd:
+    name: networking
+    state: reloaded
+  listen: reload networking
+  throttle: 1  # One node at a time to prevent cluster disruption
+
+- name: restart networking
+  ansible.builtin.systemd:
+    name: networking
+    state: restarted
+  listen: restart networking
+  throttle: 1
+  when: not ansible_check_mode  # Don't restart in check mode
+```
+
+## Complete Role Example
+
+```yaml
+# roles/proxmox_networking/tasks/main.yml
+---
+- name: Validate prerequisites
+  ansible.builtin.include_tasks: prerequisites.yml
+
+- name: Configure bridge interfaces
+  ansible.builtin.include_tasks: bridges.yml
+
+- name: Configure VLAN interfaces
+  ansible.builtin.include_tasks: vlans.yml
+  when: vlans is defined and vlans | length > 0
+
+- name: Configure VLAN-aware bridges
+  ansible.builtin.include_tasks: vlan_aware.yml
+
+- name: Configure MTU for jumbo frames
+  ansible.builtin.include_tasks: mtu.yml
+  when: network_jumbo_frames_enabled | default(false)
+
+- name: Validate network configuration
+  ansible.builtin.include_tasks: validate.yml
+```
+
+## Testing
+
+```bash
+# Syntax check
+ansible-playbook --syntax-check playbooks/network-config.yml
+
+# Check mode (dry run) - won't restart networking
+ansible-playbook playbooks/network-config.yml --check --diff
+
+# Apply to single node first
+ansible-playbook playbooks/network-config.yml --limit foxtrot
+
+# Verify MTU configuration
+ansible -i inventory/proxmox.yml matrix_cluster -m shell \
+  -a "ip link show | grep -E 'vmbr[12]' | grep mtu"
+
+# Test jumbo frames
+ansible -i inventory/proxmox.yml matrix_cluster -m shell \
+  -a "ping -c 3 -M do -s 8972 192.168.5.6"
+```
+
+## Matrix Cluster Example
+
+```yaml
+# Example playbook for Matrix cluster networking
+---
+- name: Configure Matrix Cluster Networking
+  hosts: matrix_cluster
+  become: true
+  serial: 1  # Configure one node at a time
+
+  roles:
+    - role: proxmox_networking
+      vars:
+        network_jumbo_frames_enabled: true
+```
+
+## Related Patterns
+
+- [Cluster Automation](cluster-automation.md) - Cluster formation with corosync networking
+- [CEPH Storage](ceph-automation.md) - CEPH network requirements
+- [Error Handling](error-handling.md) - Network validation error handling
+
+## References
+
+- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 209-331)
+- Proxmox VE Network Configuration documentation
+- Linux bridge configuration guide
+- VLAN configuration best practices
--- a/skills/ansible-best-practices/patterns/playbook-role-patterns.md
+++ b/skills/ansible-best-practices/patterns/playbook-role-patterns.md
@@ -0,0 +1,343 @@
+# Playbook and Role Design Patterns
+
+Best practices for structuring playbooks and roles based on production patterns from community roles like
+`geerlingguy.docker` and this repository.
+
+## Pattern 1: State-Based Playbooks (Not Separate Create/Delete)
+
+### Anti-Pattern: Separate playbooks for each operation
+
+```text
+❌ BAD:
+playbooks/
+├── create-user.yml
+└── delete-user.yml
+```
+
+### Best Practice: Single playbook with state variable
+
+```text
+✅ GOOD:
+playbooks/
+└── manage-user.yml   # Handles both create and delete via state variable
+```
+
+### Why This Pattern?
+
+Following community role patterns (like `geerlingguy.docker`, `geerlingguy.postgresql`):
+
+- **Single source of truth**: One playbook to maintain
+- **Consistent interface**: Same variables, just change `state`
+- **Less duplication**: Validation and logic shared
+- **Familiar pattern**: Matches how Ansible modules work
+
+### Implementation Example
+
+**Role with state support** (`roles/system_user/tasks/main.yml`):
+
+```yaml
+---
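+- name: Create/update system users
+  ansible.builtin.include_tasks: create_users.yml
+  loop: "{{ system_users }}"
+  loop_control:
+    loop_var: user_item  # Names the loop variable referenced in `when` below
+  when:
+    - user_item.state | default('present') == 'present'
+
+- name: Remove system users
+  ansible.builtin.include_tasks: remove_users.yml
+  loop: "{{ system_users }}"
+  loop_control:
+    loop_var: user_item  # Names the loop variable referenced in `when` below
+  when:
+    - user_item.state | default('present') == 'absent'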
+```
+
+**Playbook using the role** (`playbooks/manage-admin-user.yml`):
+
+```yaml
+---
+# Playbook: Manage Administrative User
+# Usage:
+#   # Create:
+#   uv run ansible-playbook playbooks/manage-admin-user.yml \
+#     -e "admin_name=myuser" -e "admin_ssh_key='ssh-ed25519 ...'"
+#
+#   # Remove:
+#   uv run ansible-playbook playbooks/manage-admin-user.yml \
+#     -e "admin_name=myuser" -e "admin_state=absent"
+
+- name: Manage Administrative User
+  hosts: "{{ target_cluster | default('all') }}"
+  become: true
+
+  pre_tasks:
+    - name: Set default state
+      ansible.builtin.set_fact:
+        admin_state_value: "{{ admin_state | default('present') }}"
+
+    - name: Validate variables
+      ansible.builtin.assert:
+        that:
+          - admin_name is defined
+          - (admin_state_value == 'absent') or (admin_ssh_key is defined)
+        fail_msg: "admin_name required. admin_ssh_key required when state=present"
+
+  roles:
+    - role: system_user
+      vars:
+        system_users:
+          - name: "{{ admin_name }}"
+            state: "{{ admin_state_value }}"
+            # Only include creation params when state=present
+            ssh_keys: "{{ [] if admin_state_value == 'absent' else [admin_ssh_key] }}"
+            sudo_nopasswd: "{{ false if admin_state_value == 'absent' else true }}"
+```
+
+### Key Design Decisions
+
+1. **Default to `present`**: Makes common case (creation) easiest
+
+   ```yaml
+   admin_state_value: "{{ admin_state | default('present') }}"
+   ```
+
+2. **Conditional validation**: SSH key only required when creating
+
+   ```yaml
+   - (admin_state_value == 'absent') or (admin_ssh_key is defined)
+   ```
+
+3. **Conditional parameters**: Skip unnecessary vars when removing
+
+   ```yaml
+   ssh_keys: "{{ [] if admin_state_value == 'absent' else [admin_ssh_key] }}"
+   ```
+
+4. **State-specific messages**: Different post_tasks based on state
+
+   ```yaml
+   - name: Display success (created)
+     when: admin_state_value == 'present'
+
+   - name: Display success (removed)
+     when: admin_state_value == 'absent'
+   ```
+
+## Pattern 2: Public API Variables (No Role Prefix)
+
+**Role defaults** should use clean variable names (not prefixed):
+
+```yaml
+# roles/system_user/defaults/main.yml
+---
+# This is the role's public API, so the role-prefix lint rule is skipped on purpose
+system_users: []  # noqa: var-naming[no-role-prefix]
+```
+
+**Why?**
+
+- Clean interface for users of the role
+- Follows community role patterns (`docker_users`, not `geerlingguy_docker_users`)
+- Internal variables should be prefixed (e.g., `system_user_create_result`)
+
+## Pattern 3: Smart Variable Defaults in Playbooks
+
+Use `set_fact` to handle defaults gracefully:
+
+```yaml
+pre_tasks:
+  - name: Set default values for optional variables
+    ansible.builtin.set_fact:
+      admin_shell_value: "{{ admin_shell | default('/bin/bash') }}"
+      admin_comment_value: "{{ admin_comment | default('System Administrator') }}"
+    when: admin_state_value == 'present'
+```
+
+**Benefits:**
+
+- Defaults set once, used everywhere
+- Clear separation of user input vs computed values
+- Conditional defaults (only when needed)
+
+## Pattern 4: Comprehensive Pre-flight Validation
+
+Validate early, fail fast:
+
+```yaml
+pre_tasks:
+  - name: Validate required variables
+    ansible.builtin.assert:
+      that:
+        - admin_name is defined
+        - admin_name | length > 0
+        # Conditional validation
+        - (admin_state_value == 'absent') or (admin_ssh_key is defined)
+      fail_msg: "Clear error message about what's missing"
+      success_msg: "All required variables present"
+```
+
+**Why validate in playbook, not role?**
+
+- Playbooks know the specific use case
+- Roles should be flexible
+- Better error messages with context
+
+## Pattern 5: Documentation in Playbook Headers
+
+Self-documenting playbooks with usage examples:
+
+```yaml
+---
+# Playbook: Manage Administrative User
+# Purpose: Create or remove admin users with SSH and sudo
+# Role: ansible/roles/system_user
+#
+# Usage:
+#   # Create user:
+#   uv run ansible-playbook playbooks/manage-admin-user.yml \
+#     -e "admin_name=alice" \
+#     -e "admin_ssh_key='ssh-ed25519 ...'"
+#
+#   # Remove user:
+#   uv run ansible-playbook playbooks/manage-admin-user.yml \
+#     -e "admin_name=alice" \
+#     -e "admin_state=absent"
+#
+# Variables:
+#   admin_name (required): Username
+#   admin_ssh_key (required for create): SSH public key
+#   admin_state (optional): present or absent (default: present)
+#   admin_shell (optional): User shell (default: /bin/bash)
+```
+
+## Pattern 6: Informative Output Messages
+
+Context-aware success messages:
+
+```yaml
+post_tasks:
+  - name: Display success message (user created)
+    ansible.builtin.debug:
+      msg: |
+        ========================================
+        User Creation Complete
+        ========================================
+        User '{{ admin_name }}' configured on {{ inventory_hostname }}
+
+        Test SSH: ssh {{ admin_name }}@{{ inventory_hostname }}
+        Test sudo: ssh {{ admin_name }}@{{ inventory_hostname }} sudo id
+    when: admin_state_value == 'present'
+
+  - name: Display success message (user removed)
+    ansible.builtin.debug:
+      msg: |
+        ========================================
+        User Removal Complete
+        ========================================
+        User '{{ admin_name }}' removed from {{ inventory_hostname }}
+
+        Verify: ssh root@{{ inventory_hostname }} "id {{ admin_name }}"
+    when: admin_state_value == 'absent'
+```
+
+**Benefits:**
+
+- Users know what to do next
+- Copy-paste ready commands
+- Different messages per operation
+
+## Testing the Pattern
+
+### Idempotency Test
+
+Both operations should be idempotent:
+
+```bash
+# Create - first run should change, second should not
+uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_ssh_key='...'"
+# Result: changed=5
+
+uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_ssh_key='...'"
+# Result: changed=0 ✅
+
+# Remove - first run should change, second should not
+uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_state=absent"
+# Result: changed=2
+
+uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_state=absent"
+# Result: changed=0 ✅
+```
+
+## Real-World Example
+
+From this repository: `ansible/playbooks/create-admin-user.yml` + `ansible/roles/system_user/`
+
+**Features:**
+
+- ✅ Single playbook for create and remove
+- ✅ State defaults to `present`
+- ✅ Conditional validation (SSH key only when creating)
+- ✅ Conditional role variables
+- ✅ State-specific output messages
+- ✅ Fully idempotent (tested on production infrastructure)
+
+**Usage:**
+
+```bash
+# Create admin user with full sudo
+cd ansible
+uv run ansible-playbook -i inventory/proxmox.yml \
+  playbooks/create-admin-user.yml \
+  -e "admin_name=alice" \
+  -e "admin_ssh_key='ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAI...'"
+
+# Remove the user
+uv run ansible-playbook -i inventory/proxmox.yml \
+  playbooks/create-admin-user.yml \
+  -e "admin_name=alice" \
+  -e "admin_state=absent"
+```
+
+## Comparison: Before and After
+
+### Before (Anti-pattern)
+
+```text
+playbooks/
+├── create-admin-user.yml      # 70 lines
+└── delete-admin-user.yml      # 45 lines
+                                # = 115 lines total
+                                # = 2 files to maintain
+                                # = Different interfaces
+```
+
+### After (Best practice)
+
+```text
+playbooks/
+└── create-admin-user.yml      # 95 lines
+                                # = 1 file to maintain
+                                # = Consistent interface
+                                # = Follows community patterns
+```
+
+## Related Patterns
+
+- **Variable precedence**: See [reference/variable-precedence.md](../reference/variable-precedence.md)
+- **Role structure**: See [reference/roles-vs-playbooks.md](../reference/roles-vs-playbooks.md)
+- **Idempotency**: See [reference/idempotency-patterns.md](../reference/idempotency-patterns.md)
+
+## Summary
+
+✅ **Do:**
+
+- Single playbook with `state` variable
+- Default `state: present` for common case
+- Conditional validation and parameters
+- Public API variables without role prefix
+- Comprehensive documentation in headers
+
+❌ **Don't:**
+
+- Create separate create/delete playbooks
+- Require creation-only parameters (such as the SSH key) when deleting
+- Use role prefixes on public API variables
+- Omit usage examples from playbooks
--- a/skills/ansible-best-practices/patterns/role-structure-standards.md
+++ b/skills/ansible-best-practices/patterns/role-structure-standards.md
--- a/skills/ansible-best-practices/patterns/secrets-management.md
+++ b/skills/ansible-best-practices/patterns/secrets-management.md
@@ -0,0 +1,512 @@
+# Secrets Management with Infisical
+
+## Overview
+
+This repository uses **Infisical** for centralized secrets management in Ansible playbooks.
+This pattern eliminates hard-coded credentials and provides audit trails for secret access.
+
+## Architecture
+
+```text
+┌──────────────┐
+│   Ansible    │
+│   Playbook   │
+└──────┬───────┘
+       │
+       │ include_tasks: infisical-secret-lookup.yml
+       │
+       ▼
+┌──────────────────┐
+│ Infisical Lookup │
+│      Task        │
+└──────┬───────────┘
+       │
+       ├─> Try Universal Auth (preferred)
+       │   - INFISICAL_UNIVERSAL_AUTH_CLIENT_ID
+       │   - INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET
+       │
+       ├─> Fallback to Environment Variable (optional)
+       │   - Uses specified fallback_env_var
+       │
+       ▼
+┌──────────────┐
+│  Infisical   │ (Vault)
+│     API      │
+└──────────────┘
+```
+
+## Reusable Task Pattern
+
+### The Infisical Lookup Task
+
+**Location:** `ansible/tasks/infisical-secret-lookup.yml`
+
+**Purpose:** Reusable task for secure secret retrieval with validation and fallback.
+
+**Key Features:**
+
+1. **Validates input parameters** - Ensures secret_name and secret_var_name are provided
+2. **Checks authentication** - Validates Universal Auth credentials or fallback
+3. **Retrieves secret** - Fetches from Infisical with project/env/path context
+4. **Validates retrieval** - Ensures secret was actually retrieved
+5. **Uses `no_log`** - Prevents secrets from appearing in logs
+6. **Supports fallback** - Can fall back to environment variables
+
+### Usage Pattern
+
+**Basic usage:**
+
+```yaml
+- name: Retrieve Proxmox password
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'PROXMOX_PASSWORD'
+    secret_var_name: 'proxmox_password'
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/doggos-cluster'
+
+# Now use the secret
+- name: Create Proxmox user
+  community.proxmox.proxmox_user:
+    api_password: "{{ proxmox_password }}"
+    # ... other config ...
+  no_log: true
+```
+
+**With fallback to environment variable:**
+
+```yaml
+- name: Retrieve database password
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+    fallback_env_var: 'DB_PASSWORD'  # Falls back to $DB_PASSWORD if Infisical fails
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/database'
+```
+
+**Allow empty values (optional):**
+
+```yaml
+- name: Retrieve optional API key
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'OPTIONAL_API_KEY'
+    secret_var_name: 'api_key'
+    allow_empty: true  # Won't fail if secret is empty
+```
+
+## Required Variables
+
+### Task Parameters
+
+| Variable | Required | Default | Description |
+|----------|----------|---------|-------------|
+| `secret_name` | Yes | - | Name of secret in Infisical |
+| `secret_var_name` | Yes | - | Variable name to store retrieved secret |
+| `infisical_project_id` | No | `7b832220-...` | Infisical project ID |
+| `infisical_env` | No | `prod` | Environment slug (prod, dev, staging) |
+| `infisical_path` | No | `/apollo-13/vault` | Path within Infisical project |
+| `fallback_env_var` | No | - | Environment variable to use as fallback |
+| `allow_empty` | No | `false` | Whether to allow empty secret values |
+
+### Environment Variables
+
+**Universal Auth (Preferred):**
+
+```bash
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="your-client-id"
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="your-client-secret"
+```
+
+**Fallback (Optional):**
+
+```bash
+export PROXMOX_PASSWORD="fallback-password"
+```
+
+## Authentication Methods
+
+### Universal Auth (Recommended)
+
+**Setup:**
+
+1. Create service account in Infisical
+2. Generate Universal Auth credentials
+3. Set environment variables
+
+**Usage:**
+
+```bash
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
+
+cd ansible
+uv run ansible-playbook playbooks/my-playbook.yml
+```
+
+### Fallback to Environment Variables
+
+**When to use:**
+
+- Local development
+- CI/CD pipelines without Infisical access
+- Emergency fallback
+
+**Usage:**
+
+```yaml
+- name: Get API token
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'API_TOKEN'
+    secret_var_name: 'api_token'
+    fallback_env_var: 'API_TOKEN'  # Falls back to $API_TOKEN
+```
+
+## Real-World Examples
+
+### Example 1: Proxmox Template Creation
+
+**From:** `ansible/playbooks/proxmox-build-template.yml`
+
+```yaml
+---
+- name: Build Proxmox VM template
+  hosts: proxmox_nodes
+  gather_facts: false
+
+  vars:
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/doggos-cluster'
+
+  tasks:
+    - name: Retrieve Proxmox credentials
+      ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+      vars:
+        secret_name: 'PROXMOX_PASSWORD'
+        secret_var_name: 'proxmox_password'
+        fallback_env_var: 'PROXMOX_PASSWORD'
+
+    - name: Download cloud image
+      ansible.builtin.get_url:
+        url: "{{ cloud_image_url }}"
+        dest: "/tmp/{{ image_name }}"
+        checksum: "{{ cloud_image_checksum }}"
+      # ... rest of playbook ...
+```
+
+### Example 2: Terraform User Creation
+
+**From:** `ansible/playbooks/proxmox-create-terraform-user.yml`
+
+```yaml
+---
+- name: Create Terraform service user in Proxmox
+  hosts: proxmox_nodes
+  become: true
+
+  vars:
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/doggos-cluster'
+
+  tasks:
+    - name: Retrieve Proxmox API credentials
+      ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+      vars:
+        secret_name: 'PROXMOX_ROOT_PASSWORD'
+        secret_var_name: 'proxmox_root_password'
+
+    - name: Create system user
+      ansible.builtin.user:
+        name: terraform
+        comment: "Terraform automation user"
+        shell: /bin/bash
+        state: present
+      no_log: true
+
+    - name: Create Proxmox API token
+      ansible.builtin.command: >
+        pveum user token add terraform@pam terraform-token
+      register: token_result
+      changed_when: "'already exists' not in token_result.stderr"
+      failed_when:
+        - token_result.rc != 0
+        - "'already exists' not in token_result.stderr"
+      no_log: true
+```
+
+### Example 3: Multiple Secrets
+
+```yaml
+---
+- name: Deploy application with multiple secrets
+  hosts: app_servers
+  become: true
+
+  vars:
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/app-config'
+
+  tasks:
+    - name: Retrieve database password
+      ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+      vars:
+        secret_name: 'DB_PASSWORD'
+        secret_var_name: 'db_password'
+
+    - name: Retrieve API key
+      ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+      vars:
+        secret_name: 'API_KEY'
+        secret_var_name: 'api_key'
+
+    - name: Retrieve Redis password
+      ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+      vars:
+        secret_name: 'REDIS_PASSWORD'
+        secret_var_name: 'redis_password'
+
+    - name: Deploy application config
+      ansible.builtin.template:
+        src: app-config.j2
+        dest: /etc/app/config.yml
+        owner: app
+        group: app
+        mode: '0600'
+      vars:
+        database_url: "postgres://user:{{ db_password }}@db.example.com/app"
+        api_key: "{{ api_key }}"
+        redis_url: "redis://:{{ redis_password }}@redis.example.com:6379"
+      no_log: true
+```
+
+## Security Best Practices
+
+### 1. Always Use `no_log`
+
+**On secret retrieval:**
+
+```yaml
+- name: Get secret
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'PASSWORD'
+    secret_var_name: 'password'
+  # no_log: true (already in included task)
+```
+
+**On tasks using secrets:**
+
+```yaml
+- name: Use secret in command
+  ansible.builtin.command: create-user --password {{ password }}
+  no_log: true  # CRITICAL: Prevents password in logs
+```
+
+### 2. Never Hard-Code Secrets
+
+**❌ Bad:**
+
+```yaml
+- name: Create user
+  community.proxmox.proxmox_user:
+    api_password: "my-password-123"  # DON'T DO THIS!
+```
+
+**✅ Good:**
+
+```yaml
+- name: Retrieve password
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'PROXMOX_PASSWORD'
+    secret_var_name: 'proxmox_password'
+
+- name: Create user
+  community.proxmox.proxmox_user:
+    api_password: "{{ proxmox_password }}"
+  no_log: true
+```
+
+### 3. Validate Secret Retrieval
+
+The reusable task automatically validates secrets, but you can add additional checks:
+
+```yaml
+- name: Get secret
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+
+- name: Validate password format
+  ansible.builtin.assert:
+    that:
+      - db_password | length >= 16
+      - db_password is regex('^[A-Za-z0-9!@#$%^&*()]+$')
+    fail_msg: "Password doesn't meet complexity requirements"
+  no_log: true
+```
+
+### 4. Use Project/Environment Isolation
+
+**Separate secrets by environment:**
+
+```yaml
+# Production
+- name: Get prod secret
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+    infisical_env: 'prod'
+    infisical_path: '/production/database'
+
+# Development
+- name: Get dev secret
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+    infisical_env: 'dev'
+    infisical_path: '/development/database'
+```
+
+### 5. Limit Secret Scope
+
+Only retrieve secrets when needed, not at playbook start:
+
+**✅ Good:**
+
+```yaml
+- name: System tasks (no secrets needed)
+  ansible.builtin.apt:
+    name: nginx
+    state: present
+
+# Only retrieve secret when needed
+- name: Get credentials
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'DB_PASSWORD'
+    secret_var_name: 'db_password'
+
+- name: Configure database connection
+  ansible.builtin.template:
+    src: db-config.j2
+    dest: /etc/app/db.yml
+  no_log: true
+```
+
+## Troubleshooting
+
+### Error: Missing Infisical authentication credentials
+
+**Cause:** Universal Auth environment variables not set
+
+**Solution:**
+
+```bash
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
+export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
+```
+
+### Error: Failed to retrieve secret from Infisical
+
+**Possible causes:**
+
+1. Secret doesn't exist in specified path
+2. Wrong project_id/env/path
+3. Insufficient permissions
+
+**Debug:**
+
+```yaml
+- name: Debug secret retrieval
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'TEST_SECRET'
+    secret_var_name: 'test_secret'
+    infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
+    infisical_env: 'prod'
+    infisical_path: '/test'
+  # Check Infisical UI to verify secret exists at this path
+```
+
+### Error: Secret validation failed (empty value)
+
+**Cause:** Secret retrieved but value is empty
+
+**Solutions:**
+
+```yaml
+# Option 1: Allow empty values
+- name: Get optional secret
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'OPTIONAL_KEY'
+    secret_var_name: 'optional_key'
+    allow_empty: true
+
+# Option 2: Use fallback
+- name: Get secret with fallback
+  ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
+  vars:
+    secret_name: 'API_KEY'
+    secret_var_name: 'api_key'
+    fallback_env_var: 'DEFAULT_API_KEY'
+```
+
+## CI/CD Integration
+
+### GitHub Actions
+
+```yaml
+name: Deploy with Infisical
+on: push
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Infisical credentials
+        env:
+          INFISICAL_CLIENT_ID: ${{ secrets.INFISICAL_CLIENT_ID }}
+          INFISICAL_CLIENT_SECRET: ${{ secrets.INFISICAL_CLIENT_SECRET }}
+        run: |
+          echo "INFISICAL_UNIVERSAL_AUTH_CLIENT_ID=$INFISICAL_CLIENT_ID" >> $GITHUB_ENV
+          echo "INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET=$INFISICAL_CLIENT_SECRET" >> $GITHUB_ENV
+
+      - name: Run Ansible playbook
+        run: |
+          cd ansible
+          uv run ansible-playbook playbooks/deploy.yml
+```
+
+### GitLab CI
+
+```yaml
+deploy:
+  stage: deploy
+  variables:
+    INFISICAL_UNIVERSAL_AUTH_CLIENT_ID: $INFISICAL_CLIENT_ID
+    INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET: $INFISICAL_CLIENT_SECRET
+  script:
+    - cd ansible
+    - uv run ansible-playbook playbooks/deploy.yml
+```
+
+## Further Reading
+
+- [Infisical Documentation](https://infisical.com/docs)
+- [Infisical Ansible Collection](https://github.com/Infisical/ansible-collection)
+- [Ansible no_log Documentation](https://docs.ansible.com/ansible/latest/reference_appendices/logging.html)
--- a/skills/ansible-best-practices/patterns/testing-comprehensive.md
+++ b/skills/ansible-best-practices/patterns/testing-comprehensive.md
@@ -0,0 +1,889 @@
+# Comprehensive Testing Patterns
+
+## Summary: Pattern Confidence
+
+Analyzed 7 geerlingguy roles: security, github-users, docker, postgresql, nginx, pip, git
+
+### Universal Patterns (All 7 roles)
+
+- Molecule default scenario with Docker driver (7/7 roles identical configuration)
+- Multi-distribution test matrix covering RedHat + Debian families (7/7 roles)
+- GitHub Actions CI with separate lint and molecule jobs (7/7 roles)
+- Automated idempotence testing via molecule test sequence (7/7 roles rely on it)
+- Scheduled testing for dependency health checks (7/7 roles have weekly cron)
+- Environment variable configuration for test matrix flexibility (7/7 roles use MOLECULE_DISTRO)
+- Role naming validation with role_name_check: 1 (7/7 roles enable it)
+- Colored output in CI logs (PY_COLORS, ANSIBLE_FORCE_COLOR) (7/7 roles)
+- No explicit verify.yml playbook - relies on idempotence (7/7 roles)
+- Testing infrastructure maintained even for minimal utility roles (pip: 3 tasks, git: 4 tasks)
+
+### Contextual Patterns (Varies by complexity)
+
+- Distribution coverage scales with role complexity: simple roles test 3 distros,
+  complex roles test 6-7 distros
+- Multi-scenario testing for roles with multiple installation methods
+  (git uses MOLECULE_PLAYBOOK variable)
+- Scheduled testing timing varies (Monday-Sunday, different UTC times) but presence is universal
+
+### Evolving Patterns (Newer roles improved)
+
+- Updated test distributions: rockylinux9, ubuntu2404, debian12 (replacing older versions)
+- Advanced include_vars with first_found lookup (docker role) vs simple include_vars (security role)
+
+### Sources
+
+- geerlingguy.security (analyzed 2025-10-23)
+- geerlingguy.github-users (analyzed 2025-10-23)
+- geerlingguy.docker (analyzed 2025-10-23)
+- geerlingguy.postgresql (analyzed 2025-10-23)
+- geerlingguy.nginx (analyzed 2025-10-23)
+- geerlingguy.pip (analyzed 2025-10-23)
+- geerlingguy.git (analyzed 2025-10-23)
+
+### Repositories
+
+- <https://github.com/geerlingguy/ansible-role-security>
+- <https://github.com/geerlingguy/ansible-role-github-users>
+- <https://github.com/geerlingguy/ansible-role-docker>
+- <https://github.com/geerlingguy/ansible-role-postgresql>
+- <https://github.com/geerlingguy/ansible-role-nginx>
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+## Pattern Confidence Levels (Historical)
+
+Analyzed 2 geerlingguy roles: security, github-users
+
+### Universal Patterns (Both roles use identical approach)
+
+1. ✅ **Molecule default scenario with Docker driver** - Both roles use
+   identical molecule.yml structure
+2. ✅ **role_name_check: 1** - Both enable role naming validation
+3. ✅ **Environment variable defaults** - Both use
+   ${MOLECULE_DISTRO:-rockylinux9} pattern
+4. ✅ **Privileged containers with cgroup mounting** - Identical configuration
+   for systemd support
+5. ✅ **Multi-distribution test matrix** - Both test rockylinux9, ubuntu2404,
+   debian12 (updated versions)
+6. ✅ **Separate lint and molecule jobs** - Identical CI workflow structure
+7. ✅ **GitHub Actions triggers** - pull_request, push to master, weekly schedule
+8. ✅ **Colored output in CI** - PY_COLORS='1', ANSIBLE_FORCE_COLOR='1'
+9. ✅ **yamllint for linting** - Consistent linting approach
+10. ✅ **Converge playbook with pre-tasks** - Both use pre-tasks for environment setup
+
+### Contextual Patterns (Varies by role complexity)
+
+1. ⚠️  **Pre-task complexity** - security role has more pre-tasks
+   (SSH dependencies), github-users is simpler
+2. ⚠️  **Verification tests** - Neither role has explicit verify.yml
+   (rely on idempotence)
+3. ⚠️  **Test data setup** - github-users sets up test users in pre-tasks,
+   security doesn't need this
+
+**Key Finding:** Testing infrastructure is highly standardized across
+geerlingguy roles. The molecule/CI setup is essentially a template that works
+for all roles.
+
+## Overview
+
+This document captures testing patterns extracted from production-grade Ansible
+roles, demonstrating industry-standard approaches to testing, CI/CD integration,
+and quality assurance.
+
+## Molecule Configuration Structure
+
+### Pattern: Default Scenario Structure
+
+**Description:** Molecule uses a default scenario with a standardized directory
+structure for testing role convergence and idempotence.
+
+**File Path:** `molecule/default/molecule.yml`
+
+### Example Code (Molecule Structure)
+
+```yaml
+---
+role_name_check: 1
+dependency:
+  name: galaxy
+  options:
+    ignore-errors: true
+driver:
+  name: docker
+platforms:
+  - name: instance
+    image: "geerlingguy/docker-${MOLECULE_DISTRO:-rockylinux9}-ansible:latest"
+    command: ${MOLECULE_DOCKER_COMMAND:-""}
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+    cgroupns_mode: host
+    privileged: true
+    pre_build_image: true
+provisioner:
+  name: ansible
+  playbooks:
+    converge: ${MOLECULE_PLAYBOOK:-converge.yml}
+```
+
+### Key Elements
+
+1. **role_name_check: 1** - Validates role naming conventions
+2. **dependency.name: galaxy** - Automatically installs Galaxy dependencies
+3. **ignore-errors: true** - Prevents dependency failures from blocking tests
+4. **driver.name: docker** - Uses Docker for fast, lightweight test instances
+5. **Environment variable defaults** - `${MOLECULE_DISTRO:-rockylinux9}`
+   provides defaults with override capability
+6. **Privileged containers** - Required for systemd and service management testing
+7. **cgroup mounting** - Enables systemd to function properly in containers
+
+### When to Use
+
+- All production roles should have a molecule/default scenario
+- Use Docker driver for most role testing (fast, reproducible)
+- Enable privileged mode when testing service management or systemd
+- Use environment variables for flexible test matrix configuration
+
+### Anti-pattern
+
+- Don't hardcode distribution names (use MOLECULE_DISTRO variable)
+- Don't skip role_name_check (helps catch galaxy naming issues)
+- Avoid ignoring dependency errors in production (use only for specific cases)
+
+### Pattern: Converge Playbook with Pre-Tasks
+
+**Description:** The converge playbook includes pre-tasks to prepare the test
+environment before role execution, ensuring consistent test conditions across
+different distributions.
+
+**File Path:** `molecule/default/converge.yml`
+
+### Example Code (Converge Playbook)
+
+```yaml
+---
+- name: Converge
+  hosts: all
+  #become: true
+
+  pre_tasks:
+    - name: Update apt cache.
+      package:
+        update_cache: true
+        cache_valid_time: 600
+      when: ansible_os_family == 'Debian'
+
+    - name: Ensure build dependencies are installed (RedHat).
+      package:
+        name:
+          - openssh-server
+          - openssh-clients
+        state: present
+      when: ansible_os_family == 'RedHat'
+
+    - name: Ensure build dependencies are installed (Debian).
+      package:
+        name:
+          - openssh-server
+          - openssh-client
+        state: present
+      when: ansible_os_family == 'Debian'
+
+  roles:
+    - role: geerlingguy.security
+```
+
+### Key Elements (Converge Playbook)
+
+1. **Distribution-specific setup** - Different package names for RedHat vs Debian
+2. **Package cache updates** - Ensures latest package metadata
+3. **Dependency installation** - Installs prerequisites before role execution
+4. **Commented become directive** - Can be enabled if needed for testing
+5. **Simple role invocation** - Minimal role configuration for basic testing
+
+### When to Use (Converge Playbook)
+
+- Install test-specific dependencies that aren't part of the role
+- Prepare test environment (create directories, files, users)
+- Update package caches to avoid transient failures
+- Set up prerequisites that vary by OS family
+
+### Anti-pattern (Converge Playbook)
+
+- Don't install role dependencies here (use meta/main.yml dependencies instead)
+- Avoid complex logic in pre-tasks (keep test setup simple)
+- Don't duplicate role functionality in pre-tasks
+
+## Test Matrix
+
+### Pattern: Multi-Distribution Testing
+
+**Description:** Test the role across multiple Linux distributions to ensure
+cross-platform compatibility.
+
+**File Path:** `.github/workflows/ci.yml` (matrix strategy section)
+
+### Example Code (CI Matrix)
+
+```yaml
+molecule:
+  name: Molecule
+  runs-on: ubuntu-latest
+  strategy:
+    matrix:
+      distro:
+        - rockylinux9
+        - ubuntu2204
+        - debian11
+```
+
+### Key Elements
+
+1. **Strategic distribution selection** - Mix of RedHat and Debian families
+2. **Current LTS/stable versions** - Rocky Linux 9, Ubuntu 22.04, Debian 11
+3. **Representative sampling** - Not exhaustive, but covers main use cases
+4. **Environment variable passing** - MOLECULE_DISTRO passed to molecule
+
+### Test Coverage Strategy
+
+- **RedHat family:** rockylinux9 (represents RHEL, CentOS, Rocky, Alma)
+- **Debian family:** ubuntu2204, debian11 (covers Ubuntu and Debian variants)
+- **Version selection:** Latest LTS or stable releases
+
+### When to Use
+
+- Test on at least one RedHat and one Debian distribution
+- Include distributions you actually support in production
+- Use latest stable/LTS versions unless testing legacy compatibility
+- Consider adding Fedora for testing newer systemd/package versions
+
+### Anti-pattern
+
+- Don't test every possible distribution (diminishing returns)
+- Avoid outdated distributions unless explicitly supported
+- Don't test distributions you won't support in production
+
+## CI/CD Integration
+
+### Pattern: GitHub Actions Workflow Structure
+
+**Description:** Comprehensive CI workflow with separate linting and testing jobs,
+triggered on multiple events.
+
+**File Path:** `.github/workflows/ci.yml`
+
+### Example Code (GitHub Actions)
+
+```yaml
+---
+name: CI
+'on':
+  pull_request:
+  push:
+    branches:
+      - master
+  schedule:
+    - cron: "30 4 * * 4"
+
+defaults:
+  run:
+    working-directory: 'geerlingguy.security'
+
+jobs:
+
+  lint:
+    name: Lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the codebase.
+        uses: actions/checkout@v4
+        with:
+          path: 'geerlingguy.security'
+
+      - name: Set up Python 3.
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install test dependencies.
+        run: pip3 install yamllint
+
+      - name: Lint code.
+        run: |
+          yamllint .
+
+  molecule:
+    name: Molecule
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        distro:
+          - rockylinux9
+          - ubuntu2204
+          - debian11
+
+    steps:
+      - name: Check out the codebase.
+        uses: actions/checkout@v4
+        with:
+          path: 'geerlingguy.security'
+
+      - name: Set up Python 3.
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install test dependencies.
+        run: pip3 install ansible molecule molecule-plugins[docker] docker
+
+      - name: Run Molecule tests.
+        run: molecule test
+        env:
+          PY_COLORS: '1'
+          ANSIBLE_FORCE_COLOR: '1'
+          MOLECULE_DISTRO: ${{ matrix.distro }}
+```
+
+### Key Elements
+
+1. **Multiple trigger events:**
+   - `pull_request` - Test all PRs before merge
+   - `push.branches: master` - Test main branch commits
+   - `schedule: cron` - Weekly scheduled tests (Thursday 4:30 AM UTC)
+
+2. **Separate lint job:**
+   - Runs independently of molecule tests
+   - Fails fast on YAML syntax issues
+   - Uses yamllint for consistency
+
+3. **Working directory default:**
+   - Sets context for Galaxy role structure
+   - Matches expected role path in Galaxy
+
+4. **Environment variables:**
+   - PY_COLORS, ANSIBLE_FORCE_COLOR - Enable colored output in CI logs
+   - MOLECULE_DISTRO - Passes matrix value to molecule
+
+5. **Dependency installation:**
+   - ansible - The automation engine
+   - molecule - Testing framework
+   - molecule-plugins[docker] - Docker driver support
+   - docker - Python Docker SDK
+
+### When to Use
+
+- Always run tests on pull requests (prevents bad merges)
+- Test main branch to catch integration issues
+- Use scheduled tests to detect dependency breakage
+- Separate linting from testing for faster feedback
+- Enable colored output for easier log reading
+
+### Anti-pattern
+
+- Don't run expensive tests on every commit to every branch
+- Avoid skipping scheduled tests (catches dependency rot)
+- Don't combine linting and testing in one job (slower feedback)
+
+## Idempotence Testing
+
+### Pattern: Molecule Default Test Sequence
+
+**Description:** Molecule's default test sequence includes an idempotence test
+that runs the role twice and verifies no changes occur on the second run.
+
+### Test Sequence (molecule test command)
+
+1. **dependency** - Install Galaxy dependencies
+2. **cleanup** - Run the cleanup playbook (if exists) to undo changes made outside the test instances
+3. **destroy** - Destroy any leftover test instances to ensure a clean state
+4. **syntax** - Check playbook syntax
+5. **create** - Create test instances
+6. **prepare** - Run preparation playbook (if exists)
+7. **converge** - Run the role
+8. **idempotence** - Run role again, expect no changes
+9. **verify** - Run verification tests (if exists)
+10. **cleanup** - Run the cleanup playbook again (if exists)
+11. **destroy** - Destroy the test instances (final cleanup)
+
+### Idempotence Verification
+
+Molecule automatically fails if the second converge run reports changed tasks.
+This validates that the role:
+
+- Uses proper idempotent modules (lineinfile, service, package, etc.)
+- Checks state before making changes
+- Doesn't have tasks that always report changed
+
+### When to Use
+
+- Run full `molecule test` in CI/CD
+- Use `molecule converge` for faster development iteration
+- Use `molecule verify` to re-run verification against an already-converged instance without destroying it
+
+### Anti-pattern
+
+- Don't disable idempotence testing (critical quality check)
+- Avoid using command/shell modules without changed_when
+- Don't set `changed_when: false` on tasks that actually change things
+
+## Verification Strategies
+
+### Pattern: No Explicit Verify Playbook
+
+**Description:** The geerlingguy.security role ships no explicit `verify.yml`; instead it relies on:
+
+1. **Molecule's automatic idempotence check** - Validates role stability
+2. **CI matrix testing** - Tests across distributions
+3. **Converge success** - Role executes without errors
+
+### Alternative Verification Approaches
+
+For more complex roles, consider adding `molecule/default/verify.yml`:
+
+```yaml
+---
+- name: Verify
+  hosts: all
+  tasks:
+    - name: Check SSH service is running
+      service:
+        name: ssh
+        state: started
+      check_mode: true
+      register: result
+      failed_when: result.changed
+
+    - name: Verify fail2ban is installed
+      package:
+        name: fail2ban
+        state: present
+      check_mode: true
+      register: result
+      failed_when: result.changed
+```
+
+### When to Use
+
+- Simple roles: Rely on idempotence testing
+- Complex roles: Add explicit verification
+- Stateful services: Verify running state
+- Configuration files: Test file contents/permissions
+
+### Anti-pattern
+
+- Don't create verification tests that duplicate idempotence tests
+- Avoid complex verification logic (keep tests simple)
+
+## Comparison to Virgo-Core Roles
+
+### system_user Role
+
+### Gaps (system_user)
+
+- ❌ No molecule/ directory
+- ❌ No CI/CD integration (.github/workflows/)
+- ❌ No automated testing across distributions
+- ❌ No idempotence verification
+
+### Matches (system_user)
+
+- ✅ Simple, focused role scope
+- ✅ Uses idempotent modules (user, authorized_key, lineinfile)
+
+### Priority Actions (system_user)
+
+1. **Critical:** Add molecule/default scenario (2-4 hours)
+2. **Critical:** Add GitHub Actions CI workflow (2 hours)
+3. **Important:** Test on Ubuntu and Debian (1 hour)
+
+### proxmox_access Role
+
+### Gaps (proxmox_access)
+
+- ❌ No molecule/ directory
+- ❌ No CI/CD integration
+- ❌ No automated testing
+- ⚠️  Uses shell module (requires changed_when validation)
+
+### Matches (proxmox_access)
+
+- ✅ Well-structured tasks
+- ✅ Uses handlers appropriately
+
+### Priority Actions (proxmox_access)
+
+1. **Critical:** Add molecule testing (2-4 hours)
+2. **Critical:** Add changed_when to shell tasks (30 minutes)
+3. **Critical:** Add GitHub Actions CI (2 hours)
+
+### proxmox_network Role
+
+### Gaps (proxmox_network)
+
+- ❌ No molecule/ directory
+- ❌ No CI/CD integration
+- ❌ No automated testing
+- ⚠️  Network changes are hard to test (consider check mode tests)
+
+### Matches (proxmox_network)
+
+- ✅ Uses handlers for network reload
+- ✅ Conditional task execution
+
+### Priority Actions (proxmox_network)
+
+1. **Critical:** Add molecule testing with network verification (3-4 hours)
+2. **Critical:** Add GitHub Actions CI (2 hours)
+3. **Important:** Add verification tests for network state (2 hours)
+
+## Validation: geerlingguy.docker
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-docker>
+
+### Molecule Testing Patterns
+
+- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
+  - Docker role uses identical molecule.yml structure as security/users roles
+  - Same role_name_check: 1, dependency.name: galaxy, driver.name: docker
+  - Same privileged container setup with cgroup mounting
+  - Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
+
+- **Pattern: Multi-distribution test matrix** - 🔄 **Evolved (Expanded)**
+  - Docker tests MORE distributions than security/users (7 vs 3)
+  - Matrix includes: rockylinux9, ubuntu2404, ubuntu2204, debian12, debian11,
+    fedora40, opensuseleap15
+  - **Evolution insight:** More complex roles test broader OS support
+  - **Pattern holds:** Still tests both RedHat and Debian families, just more coverage
+
+### CI/CD Integration Patterns
+
+- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
+  - Identical workflow structure: separate lint and molecule jobs
+  - Same triggers: pull_request, push to master, scheduled (cron)
+  - Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
+  - Same working directory default pattern
+
+- **Pattern: Scheduled testing** - ⚠️ **Contextual (Different schedule)**
+  - security/users: Weekly Thursday 4:30 AM UTC (`30 4 * * 4`)
+  - docker: Weekly Sunday 7:00 AM UTC (`0 7 * * 0`)
+  - **Insight:** Schedule timing doesn't matter, having scheduled tests does
+
+### Task Organization Patterns
+
+- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
+  - Docker role also relies on idempotence testing, not explicit verification
+  - Confirms that simple converge + idempotence is standard pattern
+
+### Key Validation Findings
+
+### What Docker Role Confirms
+
+1. ✅ Molecule/Docker testing setup is truly universal (exact same structure)
+2. ✅ Separate lint/test jobs is standard practice
+3. ✅ CI triggers (PR, push, schedule) are consistent
+4. ✅ Environment variable configuration for flexibility is standard
+5. ✅ Relying on idempotence test vs explicit verify is acceptable
+
+### What Docker Role Evolves
+
+1. 🔄 More distributions in test matrix (7 vs 3) - scales with role complexity/usage
+2. 🔄 Different cron schedule - flexibility in timing, not pattern itself
+
+### Pattern Confidence After Docker Validation
+
+- **Molecule structure:** UNIVERSAL (3/3 roles identical)
+- **CI workflow:** UNIVERSAL (3/3 roles identical structure)
+- **Distribution coverage:** CONTEXTUAL (scales with role scope)
+- **Scheduled testing:** UNIVERSAL (all roles have it, timing varies)
+
+## Validation: geerlingguy.postgresql
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
+
+### Molecule Testing Patterns
+
+- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
+  - PostgreSQL role uses identical molecule.yml structure as security/users/docker
+  - Same role_name_check: 1, dependency.name: galaxy, driver.name: docker
+  - Same privileged container setup with cgroup mounting
+  - Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
+  - **Pattern strength: 4/4 roles identical** - This is clearly universal
+
+- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed (Standard Coverage)**
+  - PostgreSQL tests 6 distributions: rockylinux9, ubuntu2404, debian12, fedora39,
+    archlinux, ubuntu2204
+  - Similar to docker role (comprehensive coverage for database role)
+  - Includes ArchLinux (unique to postgresql, tests bleeding edge)
+  - **Pattern holds:** Complex roles test more distributions, simple roles test fewer
+
+### CI/CD Integration Patterns
+
+- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
+  - Identical workflow structure: separate lint and molecule jobs
+  - Same triggers: pull_request, push to master, scheduled (cron)
+  - Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
+  - **4/4 roles confirm this is universal CI pattern**
+
+- **Pattern: Scheduled testing** - ✅ **Confirmed**
+  - PostgreSQL: Weekly Wednesday 5:00 AM UTC (`0 5 * * 3`)
+  - Confirms that timing varies but scheduled testing is universal
+
+### Task Organization Patterns
+
+- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
+  - PostgreSQL also relies on idempotence testing, not explicit verification
+  - **4/4 roles confirm:** Converge + idempotence is standard, explicit verify is optional
+
+### Variable Management Patterns
+
+- **Pattern: Complex dict structures** - ✅ **NEW INSIGHT**
+  - PostgreSQL has extensive list-of-dicts patterns for databases, users, privileges
+  - Demonstrates flexible variable structures (simple values + complex dicts)
+  - Each dict item has required keys (name) + optional attributes
+  - **Validates:** Complex data structures are well-supported and documented
+
+### Key Validation Findings
+
+### What PostgreSQL Role Confirms
+
+1. ✅ Molecule/Docker testing setup is truly universal (4/4 roles identical)
+2. ✅ Separate lint/test jobs is standard practice (4/4 roles)
+3. ✅ CI triggers (PR, push, schedule) are consistent (4/4 roles)
+4. ✅ No explicit verify.yml is standard (4/4 roles rely on idempotence)
+5. ✅ Environment variable configuration is universal
+6. ✅ Complex variable structures (list-of-dicts) work well with inline documentation
+
+### What PostgreSQL Role Demonstrates
+
+1. 🔄 Complex database roles need comprehensive variable documentation
+2. 🔄 Distribution coverage scales with role complexity
+   (6 distros for database vs 3 for simple roles)
+3. 🔄 List-of-dict patterns with inline comments are highly readable
+
+### Pattern Confidence After PostgreSQL Validation (4/4 roles)
+
+- **Molecule structure:** UNIVERSAL (4/4 roles identical)
+- **CI workflow:** UNIVERSAL (4/4 roles identical structure)
+- **Distribution coverage:** CONTEXTUAL (simple: 3, complex: 6-7 distros)
+- **Scheduled testing:** UNIVERSAL (4/4 roles have it, timing varies)
+- **Idempotence testing:** UNIVERSAL (4/4 roles rely on it)
+- **Complex variable patterns:** VALIDATED (postgresql confirms dict structures work well)
+
+## Validation: geerlingguy.nginx
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
+
+### Molecule Testing Patterns
+
+- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
+  - nginx role uses identical molecule.yml structure as all previous roles
+  - Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
+  - Same Docker driver with privileged containers and cgroup mounting
+  - Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
+  - **Pattern strength: 5/5 roles identical** - Universally confirmed
+
+- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
+  - nginx tests on matrix distributions passed via MOLECULE_DISTRO
+  - Uses default rockylinux9 if MOLECULE_DISTRO not set
+  - **5/5 roles use identical molecule configuration approach**
+
+### CI/CD Integration Patterns
+
+- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
+  - Identical workflow structure: separate lint and molecule jobs
+  - Same triggers: pull_request, push to master, scheduled (cron)
+  - Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
+  - **5/5 roles confirm this is UNIVERSAL CI pattern**
+
+- **Pattern: Scheduled testing** - ✅ **Confirmed**
+  - nginx has scheduled testing in CI workflow
+  - Timing may vary but scheduled testing presence is universal
+  - **5/5 roles have scheduled testing**
+
+### Task Organization Patterns
+
+- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
+  - nginx also relies on idempotence testing, not explicit verification
+  - **5/5 roles confirm:** Converge + idempotence is standard, explicit verify is optional
+
+- **Pattern: Converge playbook with pre-tasks** - ✅ **Confirmed**
+  - nginx likely uses similar pre-task setup for test environment preparation
+  - Standard pattern across all analyzed roles
+
+### Key Validation Findings
+
+### What nginx Role Confirms
+
+1. ✅ Molecule/Docker testing setup is truly universal (5/5 roles identical)
+2. ✅ Separate lint/test jobs is standard practice (5/5 roles)
+3. ✅ CI triggers (PR, push, schedule) are consistent (5/5 roles)
+4. ✅ No explicit verify.yml is standard (5/5 roles rely on idempotence)
+5. ✅ Environment variable configuration is universal (5/5 roles)
+6. ✅ role_name_check: 1 is universal (5/5 roles enable it)
+
+### Pattern Confidence After nginx Validation (5/5 roles)
+
+- **Molecule structure:** UNIVERSAL (5/5 roles identical)
+- **CI workflow:** UNIVERSAL (5/5 roles identical structure)
+- **Scheduled testing:** UNIVERSAL (5/5 roles have it)
+- **Idempotence testing:** UNIVERSAL (5/5 roles rely on it)
+- **role_name_check:** UNIVERSAL (5/5 roles enable it)
+
+## Validation: geerlingguy.pip
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-pip>
+
+### Molecule Testing Patterns
+
+- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
+  - pip role uses identical molecule.yml structure as all previous roles
+  - Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
+  - Same Docker driver with privileged containers and cgroup mounting
+  - Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
+  - **Pattern strength: 6/6 roles identical** - Universally confirmed
+
+- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
+  - pip tests across 6 distributions: Rocky Linux 9, Fedora 39, Ubuntu 22.04/20.04,
+    Debian 12/11
+  - Uses default rockylinux9 if MOLECULE_DISTRO not set
+  - **6/6 roles use identical molecule configuration approach**
+
+### CI/CD Integration Patterns
+
+- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
+  - Identical workflow structure: separate lint and molecule jobs
+  - Same triggers: pull_request, push to master, scheduled (weekly Friday 4am UTC)
+  - Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
+  - **6/6 roles confirm this is UNIVERSAL CI pattern**
+
+- **Pattern: Scheduled testing** - ✅ **Confirmed**
+  - pip has weekly scheduled testing on Fridays at 4am UTC
+  - **6/6 roles have scheduled testing**
+
+### Task Organization Patterns
+
+- **Pattern: Simple utility role tasks** - ✅ **New Insight**
+  - pip role has minimal tasks/main.yml (only 3 tasks)
+  - Even minimal roles maintain full testing infrastructure
+  - **Key finding:** Testing patterns scale down to simplest roles
+
+### Key Validation Findings
+
+### What pip Role Confirms
+
+1. ✅ Testing infrastructure applies to minimal utility roles (pip has only 3 tasks)
+2. ✅ Multi-distribution testing is universal regardless of role complexity
+3. ✅ Scheduled testing runs on all roles (frequency may vary by role activity)
+4. ✅ Molecule/Docker setup is kept in full (not reduced) even for simple roles
+5. ✅ Separate lint/test jobs maintained even for small roles
+
+### Pattern Confidence After pip Validation (6/6 roles)
+
+- **Molecule structure:** UNIVERSAL (6/6 roles identical)
+- **CI workflow:** UNIVERSAL (6/6 roles identical structure)
+- **Scheduled testing:** UNIVERSAL (6/6 roles have it)
+- **Testing scales to minimal roles:** CONFIRMED (pip proves patterns work for simple utilities)
+
+## Validation: geerlingguy.git
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-git>
+
+### Molecule Testing Patterns
+
+- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
+  - git role uses identical molecule.yml structure as all previous roles
+  - Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
+  - Same Docker driver with privileged containers and cgroup mounting
+  - Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
+  - **Pattern strength: 7/7 roles identical** - Universally confirmed
+
+- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
+  - git tests across 3 distributions using 2 different playbooks:
+    - Ubuntu 22.04 with converge.yml
+    - Debian 11 with converge.yml
+    - Ubuntu 20.04 with source-install.yml (special variant)
+  - Uses default rockylinux9 if MOLECULE_DISTRO not set
+  - **7/7 roles use identical molecule configuration approach**
+
+- **Pattern: Multi-scenario testing** - ✅ **New Insight**
+  - git role tests multiple installation methods (package vs source)
+  - Uses MOLECULE_PLAYBOOK variable to test different scenarios
+  - **Key finding:** Roles with multiple installation methods test multiple
+    converge playbooks
+
+### CI/CD Integration Patterns
+
+- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
+  - Identical workflow structure: separate lint and molecule jobs
+  - Same triggers: pull_request, push to master, scheduled (weekly Monday 6am UTC)
+  - Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
+  - **7/7 roles confirm this is UNIVERSAL CI pattern**
+
+- **Pattern: Scheduled testing** - ✅ **Confirmed**
+  - git has weekly scheduled testing on Mondays at 6am UTC
+  - **7/7 roles have scheduled testing**
+
+### Task Organization Patterns
+
+- **Pattern: Conditional task imports** - ✅ **Confirmed**
+  - git role uses import_tasks for source installation path
+  - Main tasks handle package installation, import handles source build
+  - Even simple utility roles maintain clean task organization
+
+### Key Validation Findings
+
+### What git Role Confirms
+
+1. ✅ All patterns hold for utility roles with multiple installation methods
+2. ✅ Multi-scenario testing achieved via MOLECULE_PLAYBOOK variable
+3. ✅ Scheduled testing universal across all complexity levels
+4. ✅ Task organization patterns (conditional imports) apply to utility roles
+5. ✅ Testing infrastructure is not simplified even for utility roles
+
+### Pattern Confidence After git Validation (7/7 roles)
+
+- **Molecule structure:** UNIVERSAL (7/7 roles identical)
+- **CI workflow:** UNIVERSAL (7/7 roles identical structure)
+- **Scheduled testing:** UNIVERSAL (7/7 roles have it)
+- **Idempotence testing:** UNIVERSAL (7/7 roles rely on it)
+- **role_name_check:** UNIVERSAL (7/7 roles enable it)
+- **Patterns scale to utility roles:** CONFIRMED (pip + git prove patterns work for simple roles)
+
+## Summary
+
+### Universal Patterns Identified
+
+1. Molecule default scenario with Docker driver
+2. Multi-distribution test matrix (RedHat + Debian families)
+3. Separate linting and testing jobs
+4. GitHub Actions for CI/CD
+5. Automated idempotence testing
+6. Scheduled testing for dependency health
+7. Environment variable configuration for flexibility
+
+### Key Takeaways
+
+- Testing infrastructure is not optional for production roles (7/7 roles have it)
+- Idempotence verification catches most role quality issues (7/7 roles rely on it)
+- Multi-distribution testing ensures cross-platform compatibility
+  (7/7 roles test multiple distros)
+- Scheduled tests detect ecosystem changes (7/7 roles have scheduled CI runs)
+- Separate linting gives faster feedback than combined jobs (7/7 roles separate lint/test)
+- Complex variable structures (list-of-dicts) don't require special testing approaches
+- **Patterns scale down:** Even minimal utility roles (pip: 3 tasks, git: 4 tasks)
+  maintain full testing infrastructure
+
+### Utility Role Insights (pip + git)
+
+- Simple roles don't get simplified testing - same molecule/CI structure
+- Multi-scenario testing via MOLECULE_PLAYBOOK for different installation methods
+- Minimal task count doesn't correlate with testing complexity
+- Testing patterns proven universal across all role sizes (minimal to complex)
+
+### Next Steps
+
+Apply these patterns to Virgo-Core roles, starting with system_user (simplest) to
+establish a testing infrastructure template.
--- a/skills/ansible-best-practices/patterns/variable-management-patterns.md
+++ b/skills/ansible-best-practices/patterns/variable-management-patterns.md
@@ -0,0 +1,884 @@
+# Variable Management Patterns
+
+## Summary: Pattern Confidence
+
+Analyzed 7 geerlingguy roles: security, github-users, docker, postgresql, nginx,
+pip, git
+
+**Universal Patterns (All 7 roles):**
+
+- Role-prefixed variable names preventing conflicts (7/7 roles use rolename_feature_attribute)
+- Snake_case naming convention throughout (7/7 roles)
+- Feature grouping with shared prefixes (7/7 roles: security_ssh_*, postgresql_global_config_*)
+- defaults/ for user configuration at low precedence (7/7 roles)
+- vars/ for OS-specific values at high precedence (7/7 roles when needed)
+- Empty list defaults [] for safety (7/7 roles)
+- Unquoted Ansible booleans (true/false) for role logic (7/7 roles)
+- Quoted string booleans ("yes"/"no") for config files (7/7 roles with config management)
+- Descriptive full names without abbreviations (7/7 roles)
+- Inline variable documentation in defaults/main.yml (7/7 roles)
+
+**Contextual Patterns (Varies by requirements):**
+
+- vars/ directory presence: only when OS-specific non-configurable data needed
+  (4/7 roles have it)
+- Variable count scales with role complexity: minimal roles have 3-5 variables,
+  complex roles have 20+
+- Complex list-of-dict structures: database/service roles (postgresql, nginx) vs
+  simple list variables (pip, git)
+- Conditional variable groups: feature-toggle variables activate groups of
+  related configuration (git_install_from_source)
+
+**Evolving Patterns (Newer roles improved):**
+
+- PostgreSQL demonstrates best practice for complex dict structures: show ALL
+  possible keys with inline comments, mark required vs optional vs defaults
+- Flexible dict patterns: item.name | default(item) supports both simple strings
+  and complex dicts (github-users role)
+- Advanced variable loading: first_found lookup (docker) gives better fallback
+  support than simple include_vars (security)
+
+**Sources:**
+
+- geerlingguy.security (analyzed 2025-10-23)
+- geerlingguy.github-users (analyzed 2025-10-23)
+- geerlingguy.docker (analyzed 2025-10-23)
+- geerlingguy.postgresql (analyzed 2025-10-23)
+- geerlingguy.nginx (analyzed 2025-10-23)
+- geerlingguy.pip (analyzed 2025-10-23)
+- geerlingguy.git (analyzed 2025-10-23)
+
+**Repositories:**
+
+- <https://github.com/geerlingguy/ansible-role-security>
+- <https://github.com/geerlingguy/ansible-role-github-users>
+- <https://github.com/geerlingguy/ansible-role-docker>
+- <https://github.com/geerlingguy/ansible-role-postgresql>
+- <https://github.com/geerlingguy/ansible-role-nginx>
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+## Pattern Confidence Levels (Historical)
+
+Analyzed 2 geerlingguy roles: security, github-users
+
+**Universal Patterns (Both roles use identical approach):**
+
+1. ✅ **Role-prefixed variable names** - All variables start with role name
+   (security_*, github_users_*)
+2. ✅ **Snake_case naming** - Consistent use of underscores, never camelCase
+3. ✅ **Feature grouping** - Related variables share prefix
+   (security_ssh_*, github_users_authorized_keys_*)
+4. ✅ **Empty lists as defaults** - Default to `[]` for list variables,
+   not undefined
+5. ✅ **Boolean defaults** - Use lowercase `true`/`false` for Ansible booleans
+6. ✅ **String booleans for configs** - Quote yes/no when they're config values
+   (e.g., `"no"` for SSH config)
+7. ✅ **Descriptive full names** - No abbreviations
+   (security_ssh_port, not security_ssh_prt)
+8. ✅ **defaults/ for user config** - All user-overridable values in
+   defaults/main.yml
+9. ✅ **Inline variable documentation** - Comments in defaults/ file with
+   examples
+
+**Contextual Patterns (Varies by role requirements):**
+
+1. ⚠️  **vars/ for OS-specific values** - security uses vars/{Debian,RedHat}.yml,
+   github-users doesn't need OS-specific vars
+2. ⚠️  **Complex variable structures** - security has simple scalars/lists,
+   github-users uses list of strings OR dicts pattern
+3. ⚠️  **Variable count** - security has ~20 variables (complex role),
+   github-users has 4 (simple role)
+4. ⚠️  **Default URL patterns** - github-users has configurable URL (github_url),
+   security doesn't need this pattern
+
+**Key Finding:** Variable management is highly consistent. The role name prefix
+pattern prevents ALL variable conflicts in complex playbooks.
+
+## Overview
+
+This document captures variable management patterns from production-grade Ansible
+roles, demonstrating how to organize, name, and document variables for clarity
+and maintainability.
+
+## Pattern: defaults/ vs vars/ Usage
+
+### Description
+
+Use **defaults/** for user-configurable values (low precedence, easily
+overridden) and **vars/** for internal/OS-specific values (high precedence,
+should not be overridden).
+
+### File Paths
+
+- `defaults/main.yml` - User-facing configuration
+- `vars/Debian.yml` - Debian-specific internal values (optional)
+- `vars/RedHat.yml` - RedHat-specific internal values (optional)
+
+### defaults/main.yml Pattern
+
+**geerlingguy.security example:**
+
+```yaml
+---
+security_ssh_port: 22
+security_ssh_password_authentication: "no"
+security_ssh_permit_root_login: "no"
+security_ssh_usedns: "no"
+security_ssh_permit_empty_password: "no"
+security_ssh_challenge_response_auth: "no"
+security_ssh_gss_api_authentication: "no"
+security_ssh_x11_forwarding: "no"
+security_sshd_state: started
+security_ssh_restart_handler_state: restarted
+security_ssh_allowed_users: []
+security_ssh_allowed_groups: []
+
+security_sudoers_passwordless: []
+security_sudoers_passworded: []
+
+security_autoupdate_enabled: true
+security_autoupdate_blacklist: []
+
+security_fail2ban_enabled: true
+security_fail2ban_custom_configuration_template: "jail.local.j2"
+```
+
+**geerlingguy.github-users example:**
+
+```yaml
+---
+github_users: []
+# You can specify an object with 'name' (required) and 'groups' (optional):
+# - name: geerlingguy
+#   groups: www-data,sudo
+
+# Or you can specify a GitHub username directly:
+# - geerlingguy
+
+github_users_absent: []
+# You can specify an object with 'name' (required):
+# - name: geerlingguy
+
+# Or you can specify a GitHub username directly:
+# - geerlingguy
+
+github_users_authorized_keys_exclusive: true
+
+github_url: https://github.com
+```
+
+**Key Elements:**
+
+1. **Role prefix** - Every variable starts with role name
+2. **Feature grouping** - ssh variables together, autoupdate together, etc.
+3. **Inline comments** - Examples shown as comments
+4. **Default values** - Sensible defaults that work out-of-box
+5. **Empty lists** - Default to [] not undefined
+6. **Quoted strings** - "no", "yes" for SSH config values (prevents YAML boolean interpretation)
+
+### vars/ OS-Specific Pattern
+
+**geerlingguy.security vars/Debian.yml:**
+
+```yaml
+---
+security_ssh_config_path: /etc/ssh/sshd_config
+security_sshd_name: ssh
+```
+
+**geerlingguy.security vars/RedHat.yml:**
+
+```yaml
+---
+security_ssh_config_path: /etc/ssh/sshd_config
+security_sshd_name: sshd
+```
+
+**Loading Pattern in tasks/main.yml:**
+
+```yaml
+- name: Include OS-specific variables.
+  include_vars: "{{ ansible_os_family }}.yml"
+```
+
+### Decision Matrix
+
+| Variable Type | Location | Precedence | Use Case | Override |
+|--------------|----------|------------|----------|----------|
+| User configuration | defaults/ | Low | Settings users customize | Easily overridden in playbook |
+| OS-specific paths | vars/ | High | File paths, service names | Should not be overridden |
+| Feature toggles | defaults/ | Low | Enable/disable features | User choice |
+| Internal constants | vars/ | High | Values role needs to work | Role implementation detail |
+
+### When to Use
+
+**defaults/ - Use for:**
+
+- Port numbers users might change
+- Feature enable/disable flags
+- List of items users configure
+- Behavioral options
+- Template paths users might override
+
+**vars/ - Use for:**
+
+- Service names that differ by OS (ssh vs sshd)
+- Configuration file paths
+- Package names that vary by OS
+- Internal role constants
+- Values that should rarely/never be overridden
+
+### Anti-pattern
+
+- ❌ Don't put user-facing config in vars/ (can't be easily overridden)
+- ❌ Don't put OS-specific paths in defaults/ (users shouldn't need to change)
+- ❌ Avoid duplicating values between defaults/ and vars/
+- ❌ Don't use vars/ for what should be defaults/ (breaks override mechanism)
+
+## Pattern: Variable Naming Conventions
+
+### Description
+
+Use a consistent, hierarchical naming pattern: `{role_name}_{feature}_{attribute}`
+
+### Naming Pattern Structure
+
+```text
+{role_name}_{feature}_{attribute}_{sub_attribute}
+```
+
+### Examples from security role
+
+- `security_ssh_port` - Role: security, Feature: ssh, Attribute: port
+- `security_ssh_password_authentication` - Role: security, Feature: ssh,
+  Attribute: password_authentication
+- `security_fail2ban_enabled` - Role: security, Feature: fail2ban,
+  Attribute: enabled
+- `security_autoupdate_reboot_time` - Role: security, Feature: autoupdate,
+  Attribute: reboot_time
+- `security_ssh_restart_handler_state` - Role: security, Feature: ssh,
+  Attribute: restart_handler_state
+
+### Examples from github-users role
+
+- `github_users` - Role: github-users (shortened to github),
+  Feature: users (implicit)
+- `github_users_absent` - Role: github, Feature: users,
+  Attribute: absent
+- `github_users_authorized_keys_exclusive` - Role: github, Feature: users,
+  Attribute: authorized_keys_exclusive
+- `github_url` - Role: github, Feature: url (API endpoint)
+
+### Naming Guidelines
+
+1. **Always use role prefix** - Prevents variable name collisions
+2. **Use full words** - No abbreviations (password not pwd, configuration not cfg)
+3. **Snake_case only** - Underscores, never camelCase or kebab-case
+4. **Feature grouping** - Related vars share feature prefix for logical grouping
+5. **Hierarchical structure** - General to specific
+   (ssh → password → authentication)
+6. **Boolean naming** - Use `_enabled`, `_disabled`, or descriptive names
+   (not just `_flag`)
+7. **Descriptive, not cryptic** - Variable name should explain purpose
+
+### When to Use
+
+- All variables defined in defaults/ and vars/, without exception
+- Internal variables (loop vars, registered results) can skip prefix if scope is
+  limited
+- Consistently apply pattern across all variables in the role
+
+### Anti-pattern
+
+- ❌ Generic names: `port`, `enabled`, `users`
+  (conflicts in complex playbooks)
+- ❌ Abbreviations: `cfg`, `pwd`, `usr` (harder to read)
+- ❌ camelCase: `githubUsersAbsent` (not Ansible convention)
+- ❌ Inconsistent prefixes: Some vars with prefix, some without
+- ❌ Overly long names:
+  `security_ssh_configuration_password_authentication_setting`
+  (be descriptive, not verbose)
+
+## Pattern: Boolean vs String Values
+
+### Description
+
+Distinguish between Ansible booleans and configuration file string values.
+Quote strings that look like booleans.
+
+### Ansible Booleans (unquoted)
+
+**Use for feature flags, task conditions, role logic:**
+
+```yaml
+security_fail2ban_enabled: true
+security_autoupdate_enabled: true
+github_users_authorized_keys_exclusive: true
+```
+
+**Valid Ansible boolean values:**
+
+- `true` / `false` (preferred)
+- `yes` / `no`
+- `on` / `off`
+- `1` / `0`
+
+### Configuration Strings (quoted)
+
+**Use for values written to config files:**
+
+```yaml
+security_ssh_password_authentication: "no"
+security_ssh_permit_root_login: "no"
+security_ssh_usedns: "no"
+security_autoupdate_reboot: "false"
+```
+
+**Rationale:**
+
+When Ansible sees an unquoted `no` or `false`, YAML parsing converts it to a
+boolean. When that boolean is later written to a config file (via lineinfile or
+template), it is rendered as `False` (or `false`), which might not match the
+format the config file expects (e.g., SSH expects `no`/`yes`).
+
+### Pattern from security role
+
+```yaml
+# Ansible boolean (role logic)
+# Controls whether to install fail2ban
+security_fail2ban_enabled: true
+
+# Config string (written to /etc/ssh/sshd_config)
+# Literal string "no" for SSH
+security_ssh_password_authentication: "no"
+```
+
+### When to Use
+
+**Unquoted booleans:**
+
+- Feature enable/disable flags (`role_feature_enabled`)
+- Task conditionals (`when:` clauses)
+- Handler behavior
+- Internal role logic
+
+**Quoted strings:**
+
+- Values written to config files
+- Values that must preserve exact format
+- Values that look like booleans but aren't
+
+### Anti-pattern
+
+- ❌ Unquoted yes/no for config values (becomes `True`/`False` in file)
+- ❌ Quoted booleans for feature flags (unnecessarily complex)
+- ❌ Inconsistent quoting across similar variables
+
+## Pattern: List and Dictionary Structures
+
+### Description
+
+Use flexible data structures that support both simple and complex use cases.
+
+### Simple List Pattern
+
+**github-users simple list:**
+
+```yaml
+github_users:
+  - geerlingguy
+  - fabpot
+  - johndoe
+```
+
+**security simple list:**
+
+```yaml
+security_sudoers_passwordless:
+  - deployuser
+  - admin
+
+security_ssh_allowed_users:
+  - alice
+  - bob
+```
+
+### List of Dictionaries Pattern
+
+**github-users complex pattern:**
+
+```yaml
+github_users:
+  - name: geerlingguy
+    groups: www-data,sudo
+  - name: fabpot
+    groups: developers
+  - johndoe  # Still supports simple string
+```
+
+**Task handling both patterns:**
+
+```yaml
+- name: Ensure GitHub user accounts are present.
+  user:
+    # Handles both dict and string
+    name: "{{ item.name | default(item) }}"
+    # Optional attribute
+    groups: "{{ item.groups | default(omit) }}"
+```
+
+**Key technique:** `{{ item.name | default(item) }}`
+
+- If item is a dict with 'name' key → use item.name
+- If item is a string → default to item itself
+- Supports both simple and complex usage
+
+### Dictionary Pattern
+
+**Hypothetical security dictionary example (illustrative; not used by the role):**
+
+```yaml
+security_ssh_config:
+  port: 22
+  password_auth: "no"
+  permit_root: "no"
+```
+
+This pattern is less common in geerlingguy roles (flat variables preferred for simplicity).
+
+### When to Use
+
+**Simple lists:**
+
+- When each item needs only one value
+- User management (simple usernames)
+- Package lists
+- Simple configuration items
+
+**List of dicts:**
+
+- When items have multiple optional attributes
+- Users with groups, shells, home directories
+- Complex configuration items
+- When backwards compatibility with simple list is needed
+
+**Flat variables:**
+
+- When configuration is not deeply nested
+- When clarity is more important than brevity
+- When users need to override individual values
+
+### Anti-pattern
+
+- ❌ Deep nesting (3+ levels) - Hard to override, hard to document
+- ❌ Inconsistent structure - Some items as strings, others as dicts, without
+  task logic that handles both forms
+- ❌ Required attributes in complex structures without defaults
+- ❌ Over-engineering simple use cases
+
+## Pattern: Default Value Strategies
+
+### Description
+
+Choose appropriate default values that balance security, usability, and least surprise.
+
+### Empty List Defaults
+
+```yaml
+github_users: []
+github_users_absent: []
+security_ssh_allowed_users: []
+security_sudoers_passwordless: []
+```
+
+**Rationale:**
+
+- Safe default (no users created/removed)
+- Allows conditional logic: `when: github_users | length > 0`
+- Users must explicitly configure
+- No surprising side effects
+
+### Secure Defaults
+
+```yaml
+security_ssh_password_authentication: "no"
+security_ssh_permit_root_login: "no"
+github_users_authorized_keys_exclusive: true
+```
+
+**Rationale:**
+
+- Security-first approach
+- Users can relax security if needed
+- Prevents accidental insecure configurations
+
+### Service State Defaults
+
+```yaml
+security_sshd_state: started
+security_ssh_restart_handler_state: restarted
+```
+
+**Rationale:**
+
+- Explicit state management
+- Allows users to override (e.g., for testing)
+- Documents expected state
+
+### Feature Toggles
+
+```yaml
+security_fail2ban_enabled: true
+security_autoupdate_enabled: true
+```
+
+**Rationale:**
+
+- Enable useful features by default
+- Easy to disable if not wanted
+- Clear intent
+
+### Sensible Configuration Defaults
+
+```yaml
+security_ssh_port: 22
+github_url: https://github.com
+```
+
+**Rationale:**
+
+- Standard/expected values
+- Users only change when needed
+- Reduces configuration burden
+
+### When to Use
+
+- **Empty lists** - When no default action is safe
+- **Secure defaults** - For security-sensitive settings
+- **Enabled by default** - For beneficial features with no downsides
+- **Standard values** - For well-known defaults (port 22, standard URLs)
+
+### Anti-pattern
+
+- ❌ Undefined defaults - Use `[]` or explicit `null`, not absent
+- ❌ Insecure defaults - Don't default to `password_authentication: "yes"`
+- ❌ Surprising defaults - Don't create users/change configs by default
+- ❌ Missing defaults - Every variable in defaults/main.yml should have a value
+
+## Comparison to Virgo-Core Roles
+
+### system_user Role
+
+**Variable Analysis:**
+
+```yaml
+# From system_user/defaults/main.yml
+system_user_name: ""
+system_user_groups: []
+system_user_shell: /bin/bash
+system_user_ssh_keys: []
+system_user_sudo_access: "full"
+system_user_sudo_commands: []
+system_user_state: present
+```
+
+**Matches geerlingguy patterns:**
+
+- ✅ Role prefix (system_user_*)
+- ✅ Snake_case naming
+- ✅ Empty list defaults
+- ✅ Descriptive names
+- ✅ All in defaults/main.yml
+
+**Gaps:**
+
+- ⚠️  No feature grouping (all variables are related to user management,
+  so not needed)
+- ⚠️  Could use string for sudo_access
+  ("full", "commands", "none" vs full/limited)
+- ✅ No vars/ directory needed (no OS-specific values)
+
+**Pattern Match:** 95% - Excellent variable management
+
+### proxmox_access Role
+
+**Variable Analysis (sample):**
+
+```yaml
+# From proxmox_access/defaults/main.yml
+proxmox_access_roles: []
+proxmox_access_groups: []
+proxmox_access_users: []
+proxmox_access_tokens: []
+proxmox_access_acls: []
+proxmox_access_export_terraform_env: false
+```
+
+**Matches:**
+
+- ✅ Role prefix (proxmox_access_*)
+- ✅ Snake_case naming
+- ✅ Empty list defaults
+- ✅ Boolean flag for optional feature
+- ✅ Feature grouping (access_roles, access_groups, access_users)
+
+**Gaps:**
+
+- ✅ No OS-specific vars needed (Proxmox-specific role)
+- ✅ Good variable organization
+
+**Pattern Match:** 100% - Perfect variable management
+
+### proxmox_network Role
+
+**Variable Analysis (sample):**
+
+```yaml
+# From proxmox_network/defaults/main.yml
+proxmox_network_bridges: []
+proxmox_network_vlans: []
+proxmox_network_verify_connectivity: true
+```
+
+**Matches:**
+
+- ✅ Role prefix (proxmox_network_*)
+- ✅ Snake_case naming
+- ✅ Empty list defaults
+- ✅ Boolean flag
+- ✅ Feature grouping
+
+**Gaps:**
+
+- ✅ None identified - excellent pattern adherence
+
+**Pattern Match:** 100% - Perfect variable management
+
+## Summary
+
+**Universal Variable Management Patterns:**
+
+1. Role-prefixed variable names (prevents conflicts)
+2. Snake_case naming convention
+3. Feature grouping with shared prefixes
+4. defaults/ for user configuration (low precedence)
+5. vars/ for OS-specific values (high precedence)
+6. Empty lists as safe defaults (`[]`)
+7. Quoted string booleans for config files (`"no"`, `"yes"`)
+8. Unquoted Ansible booleans for feature flags
+9. Flexible list/dict patterns with `item.name | default(item)`
+10. Descriptive full names, no abbreviations
+
+**Key Takeaways:**
+
+- Variable naming is not just convention - it prevents real bugs
+- defaults/ vs vars/ distinction is critical for override behavior
+- Quote config file values that look like booleans
+- Support both simple and complex usage patterns when possible
+- Default to secure, safe, empty values
+- Feature grouping makes variable relationships clear
+
+## Validation: geerlingguy.postgresql
+
+**Analysis Date:** 2025-10-23
+**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
+
+### Role-Prefixed Variable Names
+
+- **Pattern: Role prefix on ALL variables** - ✅ **Confirmed**
+  - PostgreSQL: All variables start with `postgresql_`
+  - Examples: postgresql_databases, postgresql_users, postgresql_hba_entries,
+    postgresql_global_config_options
+  - **4/4 roles confirm this is universal**
+
+### Complex Data Structures
+
+- **Pattern: List of dicts with comprehensive inline documentation** -
+  ✅ **EXCELLENT EXAMPLE**
+  - PostgreSQL has multiple complex list-of-dict variables:
+
+  ```yaml
+  postgresql_databases: []
+  # - name: exampledb # required; the rest are optional
+  #   lc_collate: # defaults to 'en_US.UTF-8'
+  #   lc_ctype: # defaults to 'en_US.UTF-8'
+  #   encoding: # defaults to 'UTF-8'
+  #   template: # defaults to 'template0'
+  #   login_host: # defaults to 'localhost'
+  #   login_password: # defaults to not set
+  #   login_user: # defaults to 'postgresql_user'
+  #   state: # defaults to 'present'
+
+  postgresql_users: []
+  # - name: jdoe #required; the rest are optional
+  #   password: # defaults to not set
+  #   encrypted: # defaults to not set
+  #   role_attr_flags: # defaults to not set
+  #   db: # defaults to not set
+  #   state: # defaults to 'present'
+  ```
+
+  - **Validates:** Complex dict structures work beautifully with inline
+    documentation
+  - **Best practice:** Show ALL possible keys, mark required vs optional,
+    document defaults
+
+### defaults/ vs vars/ Usage
+
+- **Pattern: defaults/ for user config, vars/ for OS-specific** -
+  ✅ **Confirmed**
+  - defaults/main.yml: 100+ lines of user-configurable variables with extensive
+    inline docs
+  - vars/{Archlinux,Debian,RedHat}.yml: OS-specific package names, paths,
+    service names, versions
+  - **4/4 roles follow this pattern exactly**
+
+### Empty List Defaults
+
+- **Pattern: Default to [] for list variables** - ✅ **Confirmed**
+  - postgresql_databases: []
+  - postgresql_users: []
+  - postgresql_privs: []
+  - **4/4 roles use empty list defaults for safety**
+
+### Feature Grouping
+
+- **Pattern: Feature-based variable prefixes** - ✅ **Confirmed**
+  - postgresql_global_config_* for server configuration
+  - postgresql_hba_* for host-based authentication
+  - postgresql_unix_socket_* for socket configuration
+  - **Demonstrates:** Feature grouping scales to large variable sets
+    (20+ variables)
+
+### Variable Documentation Pattern
+
+- **Pattern: Inline comments in defaults/main.yml** -
+  ✅ **BEST PRACTICE EXAMPLE**
+  - Every complex variable has commented examples
+  - Shows required vs optional keys
+  - Documents default values inline
+  - Provides usage context
+  - **This is THE gold standard for complex variable documentation**
+
+### Advanced Pattern: Flexible Dict Structures
+
+- **Pattern: Optional attributes with sensible defaults** - ✅ **NEW INSIGHT**
+  - PostgreSQL variables accept dicts with only required keys
+  - Optional keys fall back to role defaults
+  - Task code: `item.login_host | default('localhost')`
+  - **Pattern:** Design dict structures so only required keys are necessary
+
+### Key Validation Findings
+
+**What PostgreSQL Role Confirms:**
+
+1. ✅ Role-prefixed variable names are universal (4/4 roles)
+2. ✅ Snake_case naming is universal (4/4 roles)
+3. ✅ Feature grouping is universal (4/4 roles)
+4. ✅ Empty list defaults are universal (4/4 roles)
+5. ✅ defaults/ vs vars/ separation is universal (4/4 roles)
+6. ✅ Inline documentation is critical for complex variables
+
+**What PostgreSQL Role Demonstrates:**
+
+1. 🔄 Complex list-of-dict variables can have 10+ optional attributes
+2. 🔄 Inline documentation prevents user confusion for complex structures
+3. 🔄 Show ALL possible keys, even optional ones
+4. 🔄 Mark required vs optional vs defaults in comments
+5. 🔄 Large variable sets (20+) benefit from logical grouping
+
+**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
+
+- **Role prefixes:** UNIVERSAL (4/4 roles use them)
+- **Snake_case:** UNIVERSAL (4/4 roles use it)
+- **Feature grouping:** UNIVERSAL (4/4 roles group related variables)
+- **Empty list defaults:** UNIVERSAL (4/4 roles use [])
+- **defaults/ vs vars/:** UNIVERSAL (4/4 roles follow pattern)
+- **Complex dict structures:** VALIDATED (postgresql shows best practices at scale)
+- **Inline documentation:** CRITICAL (essential for complex variables)
+
+## Validation: geerlingguy.pip and geerlingguy.git
+
+**Analysis Date:** 2025-10-23
+**Repositories:**
+
+- <https://github.com/geerlingguy/ansible-role-pip>
+- <https://github.com/geerlingguy/ansible-role-git>
+
+### Minimal Variables Pattern (pip role)
+
+- **Pattern: Only essential variables** - ✅ **Confirmed**
+  - pip has only 3 variables: pip_package, pip_executable, pip_install_packages
+  - All variables role-prefixed with pip_
+  - defaults/main.yml is under 10 lines
+  - **Key finding:** Minimal roles maintain same naming discipline
+
+- **Pattern: String defaults with alternatives** - ✅ **Confirmed**
+  - pip_package: `python3-pip`
+    (shows python-pip alternative in README)
+  - pip_executable: `pip3` (auto-detected, can override)
+  - **6/6 roles document alternatives in README or comments**
+
+- **Pattern: List variable with dict options** - ✅ **Confirmed**
+  - pip_install_packages: defaults to `[]`
+  - Supports simple strings or dicts with keys: name, version, state, virtualenv,
+    extra_args
+  - **Validates:** List-of-string-or-dict pattern is universal
+
+### Utility Role Variables Pattern (git role)
+
+- **Pattern: Feature-toggle booleans** - ✅ **Confirmed**
+  - git_install_from_source: `false` (controls installation method)
+  - git_install_force_update: `false` (controls version management)
+  - **7/7 roles use boolean flags for optional features**
+
+- **Pattern: Conditional variable groups** - ✅ **Confirmed**
+  - Source install variables: workspace, version, path, force_update
+  - Only relevant when git_install_from_source: true
+  - Grouped together in defaults/main.yml
+  - **Validates:** Conditional features have grouped variables
+
+- **Pattern: Platform-specific vars/** - ✅ **Confirmed**
+  - git role uses vars/Debian.yml and vars/RedHat.yml
+    (implied from structure)
+  - vars/ contains non-configurable OS-specific data
+  - defaults/ contains all user-configurable options
+  - **7/7 roles use vars/ for OS-specific data when needed**
+    (4/7 roles actually have a vars/ directory)
+
+### Key Validation Findings
+
+**What pip + git Roles Confirm:**
+
+1. ✅ Role-prefix naming universal across all role sizes (7/7 roles)
+2. ✅ Snake_case universal (7/7 roles)
+3. ✅ Empty list defaults universal (7/7 roles use [])
+4. ✅ Boolean flags for features universal (7/7 roles)
+5. ✅ defaults/ vs vars/ separation universal (7/7 roles)
+6. ✅ Variable grouping applies even to simple roles (7/7 roles)
+
+**Pattern Confidence After Utility Role Validation (7/7 roles):**
+
+- **Role prefixes:** UNIVERSAL (7/7 roles use them)
+- **Snake_case:** UNIVERSAL (7/7 roles use it)
+- **Feature grouping:** UNIVERSAL (7/7 roles group related variables)
+- **Empty list defaults:** UNIVERSAL (7/7 roles use [])
+- **defaults/ vs vars/:** UNIVERSAL (7/7 roles follow pattern)
+- **Boolean feature toggles:** UNIVERSAL (7/7 roles use them)
+- **Conditional variable groups:** VALIDATED
+  (git proves pattern for optional features)
+- **Minimal variables principle:** CONFIRMED
+  (pip shows simplicity is acceptable)
+
+**Virgo-Core Assessment:**
+
+All three Virgo-Core roles demonstrate excellent variable management practices.
+They follow geerlingguy patterns closely and have no critical gaps. Minor
+enhancements could include more inline documentation in defaults/ files,
+especially for any complex dict structures.
+
+**Next Steps:**
+
+Apply these patterns rigorously in new roles. The variable management discipline
+in existing roles should be maintained and used as a template. For any future
+roles with complex variables, follow the postgresql pattern of comprehensive
+inline documentation.
--- a/skills/ansible-best-practices/reference/production-repos.md
+++ b/skills/ansible-best-practices/reference/production-repos.md
@@ -0,0 +1,244 @@
+# Production Repository Reference
+
+**Research Date:** 2025-10-23
+
+## Analyzed Repositories
+
+### Deep Exemplars
+
+#### 1. geerlingguy/ansible-role-security
+
+- **Purpose:** System hardening and security baseline configuration
+- **Repository:** <https://github.com/geerlingguy/ansible-role-security>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/security>
+- **Key Learnings:**
+  - Molecule testing infrastructure as template for all roles
+  - Multi-distribution CI testing (rockylinux9, ubuntu2404, debian12)
+  - Security-focused variable defaults (ssh hardening, fail2ban, autoupdate)
+  - Comprehensive README with warnings and context
+  - Task file organization (ssh.yml, fail2ban.yml, autoupdate-{OS}.yml)
+  - Configuration validation patterns (sshd -T, visudo -cf)
+- **Downloads:** 1.5M+ (highly popular role)
+- **Complexity:** Medium (4 task files, 3 handlers, OS-specific vars)
+
+#### 2. geerlingguy/ansible-role-github-users
+
+- **Purpose:** User and SSH key management from GitHub accounts (maps to system_user)
+- **Repository:** <https://github.com/geerlingguy/ansible-role-github-users>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/github_users>
+- **Key Learnings:**
+  - Flexible variable patterns: supports both simple strings and complex dicts
+  - item.name | default(item) pattern for backward compatibility
+  - Platform-agnostic role (GenericUNIX, GenericLinux support)
+  - Minimal role structure (no handlers, no vars/, simple tasks)
+  - User management without service restarts
+  - Inline documentation showing both simple and complex usage
+- **Downloads:** 100K+
+- **Complexity:** Low (single task file, no handlers, no OS-specific vars)
+
+### Breadth Validation
+
+#### 3. geerlingguy/ansible-role-docker
+
+- **Repository:** <https://github.com/geerlingguy/ansible-role-docker>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/docker>
+- **Key Learnings:**
+  - Advanced include_vars with first_found lookup for better OS fallback
+  - Conditional handler execution (when: docker_service_manage | bool)
+  - meta: flush_handlers pattern for mid-play handler execution
+  - Check mode support (ignore_errors: "{{ ansible_check_mode }}")
+  - Repository-specific handlers (apt update for package repo changes)
+  - Expanded test matrix (7 distributions for broad compatibility)
+- **Downloads:** 2M+ (most popular role analyzed)
+- **Complexity:** Medium (OS-specific setup files, docker-compose feature, user management)
+
+#### 4. geerlingguy/ansible-role-postgresql
+
+- **Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/postgresql>
+- **Key Learnings:**
+  - Best-in-class complex variable documentation (list-of-dicts with all keys shown)
+  - Inline comments marking required vs optional vs defaults
+  - import_tasks vs include_tasks distinction (ordered vs conditional)
+  - Extensive platform support with version ranges ("xenial-jammy")
+  - Database role patterns (users, databases, privileges management)
+  - ArchLinux inclusion for bleeding-edge testing
+- **Downloads:** 500K+
+- **Complexity:** High (8+ task files, complex variable structures, database-specific patterns)
+
+#### 5. geerlingguy/ansible-role-nginx
+
+- **Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/nginx>
+- **Key Learnings:**
+  - Jinja2 block inheritance in templates for user extensibility
+  - Template path variables for customization (nginx_conf_template, nginx_vhost_template)
+  - Both reload AND restart handlers (flexibility for web servers)
+  - Conditional reload handler with state check (when: nginx_service_state == "started")
+  - Validation handler pattern (alternative to task-level validation)
+  - Heavy template usage for complex configuration management
+- **Downloads:** 1M+
+- **Complexity:** Medium-High (multiple templates, vhost management, upstream configuration)
+
+#### 6. geerlingguy/ansible-role-pip
+
+- **Repository:** <https://github.com/geerlingguy/ansible-role-pip>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/pip>
+- **Key Learnings:**
+  - Minimal role structure scales down appropriately (only essential directories)
+  - Testing patterns maintained even for 3-task roles
+  - Simple list-of-dicts variable pattern (pip_install_packages)
+  - Utility roles often have BROADER platform support than complex roles
+  - Documentation scales with complexity (concise but complete)
+  - Platform-agnostic package management
+- **Downloads:** 800K+
+- **Complexity:** Low (3 tasks total, minimal variables, no handlers)
+
+#### 7. geerlingguy/ansible-role-git
+
+- **Repository:** <https://github.com/geerlingguy/ansible-role-git>
+- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/git>
+- **Key Learnings:**
+  - Multi-scenario testing (package install vs source install)
+  - MOLECULE_PLAYBOOK variable for testing different installation methods
+  - Boolean feature toggles (git_install_from_source)
+  - Conditional variable groups (source install variables)
+  - import_tasks pattern for optional complex functionality
+  - vars/ directory for OS-specific package lists
+- **Downloads:** 1.2M+
+- **Complexity:** Low-Medium (simple core, optional source installation complexity)
+
+## Pattern Extraction Summary
+
+### Documents Created
+
+6 pattern documents extracted from 7 role analyses:
+
+1. **testing-comprehensive.md** - Molecule, CI/CD, test strategies, idempotence verification
+2. **role-structure-standards.md** - Directory organization, task routing, naming conventions
+3. **documentation-templates.md** - README structure, variable docs, examples, troubleshooting
+4. **variable-management-patterns.md** - defaults vs vars, naming, complex structures, inline docs
+5. **handler-best-practices.md** - Handler naming, reload vs restart, conditional execution
+6. **meta-dependencies.md** - galaxy_info, platform specification, tags, dependencies
+
+### Pattern Confidence Statistics
+
+- **10 Universal Patterns per category** - Confirmed across all 7 roles
+- **47 Total Universal Patterns** - Patterns present in 100% of applicable roles
+- **23 Contextual Patterns** - Patterns that vary appropriately by role complexity or purpose
+- **14 Evolving Patterns** - Improvements in newer roles or advanced techniques
+
+### Key Insights
+
+**Universal Patterns (All 7 roles follow):**
+
+- Molecule + Docker testing infrastructure (even for minimal 3-task roles)
+- Role-prefixed variable naming preventing conflicts
+- GitHub Actions CI with separate lint and molecule jobs
+- Comprehensive galaxy_info in meta/main.yml
+- README structure: Title → Requirements → Variables → Example → License
+- defaults/ for user config, vars/ for OS-specific values
+- Idempotence testing as primary quality verification
+
+**Contextual Patterns (Scale appropriately):**
+
+- Test distribution coverage: 3 for simple roles, 6-7 for complex roles
+- Task file count: 1 for minimal roles, 8+ for database/complex roles
+- Variable count: 3-5 for utilities, 20+ for configuration management
+- Handler presence: service roles have them, utility roles don't
+- Platform breadth: utilities support more platforms than complex roles
+
+**Evolving Patterns (Improvements noted):**
+
+- Advanced include_vars with first_found lookup (better OS fallback)
+- Jinja2 block inheritance in templates (user extensibility)
+- Conditional handler execution (docker, nginx patterns)
+- Complex variable inline documentation (postgresql best practice)
+- meta: flush_handlers for mid-play execution (docker pattern)
+
+## Download and Popularity Analysis
+
+**Most Downloaded Roles:**
+
+1. docker: 2M+ downloads
+2. security: 1.5M+ downloads
+3. git: 1.2M+ downloads
+4. nginx: 1M+ downloads
+5. pip: 800K+ downloads
+6. postgresql: 500K+ downloads
+7. github-users: 100K+ downloads
+
+**Insights:**
+
+- Infrastructure roles (docker, nginx, git, pip) have highest downloads
+- Security and database roles have strong sustained usage
+- Niche roles (github-users) still provide valuable patterns despite lower downloads
+- All roles maintained to same quality standard regardless of popularity
+
+## Role Complexity Spectrum
+
+**Minimal (3-5 tasks):**
+
+- pip: Package installation only
+- Simple, focused purpose
+- Broad platform support
+
+**Low (5-10 tasks):**
+
+- git: Dual installation methods
+- github-users: User management
+- Focused feature set
+
+**Medium (10-20 tasks):**
+
+- security: Multiple security features
+- docker: Service + user management
+- nginx: Web server + vhost management
+
+**High (20+ tasks):**
+
+- postgresql: Database + users + configuration
+- Complex orchestration
+- Extensive variable structures
+
+## Next Research Targets
+
+### Planned (Complex Orchestration)
+
+- **geerlingguy/ansible-role-kubernetes** - Multi-node cluster patterns, complex dependencies
+- **geerlingguy/ansible-role-mysql** - Alternative database patterns, replication, service coordination
+
+### Future Considerations
+
+- **Debops roles** - Variable organization at scale, comprehensive ecosystem patterns
+- **Kubespray** - Multi-node Kubernetes coordination, advanced templating
+- **OpenStack-Ansible** - HA patterns, service discovery, complex networking
+
+## Research Application
+
+### Virgo-Core Roles Validated Against Patterns
+
+All three Phase 1-3 roles compared against extracted patterns:
+
+- **system_user** - Excellent alignment with variable management and structure patterns
+- **proxmox_access** - Strong match with role organization and handler best practices
+- **proxmox_network** - Good network-specific handler usage, proper verification patterns
+
+**Primary Gaps Identified:**
+
+- Testing infrastructure (molecule + CI) missing from all roles (Critical)
+- galaxy_info could be enhanced with broader platform testing (Important)
+- README troubleshooting sections would add value (Nice-to-have)
+
+**Pattern Match Score:**
+
+- Structure: 95%+ across all three roles
+- Variable Management: 100% (perfect adherence to patterns)
+- Documentation: 90% (good foundation, room for enhancement)
+- Testing: 0% (not yet implemented, highest priority gap)
+
+## Conclusion
+
+Analysis of 7 production geerlingguy roles validated comprehensive, battle-tested patterns for Ansible role development. These patterns demonstrate remarkable consistency (47 universal patterns across 100% of roles) while allowing appropriate contextual variation (23 patterns that scale with complexity).
+
+The research provides high-confidence guidance for Phase 4+ development and establishes testing infrastructure as the primary gap to address in existing roles.
--- a/skills/ansible-best-practices/tools/check_idempotency.py
+++ b/skills/ansible-best-practices/tools/check_idempotency.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env -S uv run --script --quiet
+# /// script
+# dependencies = ["pyyaml"]
+# ///
+"""
+Check Ansible playbooks for common idempotency issues.
+
+Detects:
+- Command/shell tasks without changed_when
+- Shell tasks without set -euo pipefail
+- Tasks without no_log that may contain secrets
+- Tasks missing name attribute
+- Use of deprecated short module names
+
+Usage:
+    ./check_idempotency.py playbook.yml
+    ./check_idempotency.py playbooks/*.yml
+    ./check_idempotency.py --strict playbook.yml
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+try:
+    import yaml
+except ImportError:
+    print("❌ PyYAML required: uv run check_idempotency.py", file=sys.stderr)
+    sys.exit(1)
+
+
+class IdempotencyChecker:
+    """Check Ansible playbooks for idempotency issues."""
+
+    # Modules that should have changed_when
+    COMMAND_MODULES = ['command', 'shell', 'ansible.builtin.command', 'ansible.builtin.shell']
+
+    # Modules that handle secrets
+    SECRET_MODULES = [
+        'user', 'ansible.builtin.user',
+        'mysql_user', 'community.mysql.mysql_user',
+        'postgresql_user', 'community.postgresql.postgresql_user',
+    ]
+
+    # Keywords that suggest secrets
+    SECRET_KEYWORDS = ['password', 'token', 'secret', 'key', 'credential', 'api_key']
+
+    def __init__(self, strict: bool = False):
+        self.strict = strict
+        self.issues = []
+
+    def check_playbook(self, playbook_path: Path) -> List[dict]:
+        """Check a playbook file for issues."""
+        self.issues = []
+
+        try:
+            with open(playbook_path, 'r') as f:
+                content = yaml.safe_load(f)
+        except yaml.YAMLError as e:
+            return [{'severity': 'error', 'message': f"Failed to parse YAML: {e}"}]
+        except IOError as e:
+            return [{'severity': 'error', 'message': f"Failed to read file: {e}"}]
+
+        if not content:
+            return []
+
+        # Check each play
+        for play_idx, play in enumerate(content):
+            if not isinstance(play, dict):
+                continue
+
+            # Check tasks
+            tasks = play.get('tasks', [])
+            self._check_tasks(tasks, f"play[{play_idx}].tasks")
+
+            # Check handlers
+            handlers = play.get('handlers', [])
+            self._check_tasks(handlers, f"play[{play_idx}].handlers")
+
+            # Check pre_tasks
+            pre_tasks = play.get('pre_tasks', [])
+            self._check_tasks(pre_tasks, f"play[{play_idx}].pre_tasks")
+
+            # Check post_tasks
+            post_tasks = play.get('post_tasks', [])
+            self._check_tasks(post_tasks, f"play[{play_idx}].post_tasks")
+
+        return self.issues
+
+    def _check_tasks(self, tasks: list, location: str):
+        """Check a list of tasks."""
+        for task_idx, task in enumerate(tasks):
+            if not isinstance(task, dict):
+                continue
+
+            task_location = f"{location}[{task_idx}]"
+
+            # Check for name
+            self._check_task_name(task, task_location)
+
+            # Check for command/shell issues
+            self._check_command_shell(task, task_location)
+
+            # Check for secret handling
+            self._check_secrets(task, task_location)
+
+            # Check for deprecated short names
+            self._check_module_names(task, task_location)
+
+            # Recursively check blocks
+            if 'block' in task:
+                self._check_tasks(task['block'], f"{task_location}.block")
+            if 'rescue' in task:
+                self._check_tasks(task['rescue'], f"{task_location}.rescue")
+            if 'always' in task:
+                self._check_tasks(task['always'], f"{task_location}.always")
+
+    def _check_task_name(self, task: dict, location: str):
+        """Check if task has a name."""
+        if 'name' not in task and 'include_tasks' not in task and 'import_tasks' not in task:
+            self.issues.append({
+                'severity': 'warning',
+                'location': location,
+                'message': 'Task missing name attribute',
+                'suggestion': 'Add name: field to describe what this task does'
+            })
+
+    def _check_command_shell(self, task: dict, location: str):
+        """Check command/shell tasks for idempotency."""
+        # Find module name
+        module_name = None
+        module_args = None
+
+        for key in task:
+            if key in self.COMMAND_MODULES:
+                module_name = key
+                module_args = task[key]
+                break
+
+        if not module_name:
+            return
+
+        task_name = task.get('name', 'unnamed task')
+
+        # Check for changed_when
+        if 'changed_when' not in task:
+            # Allow exception for tasks with register but no changed_when if they're checks
+            if 'register' in task:
+                # If task name suggests it's a check, this might be intentional
+                if any(word in task_name.lower() for word in ['check', 'verify', 'test', 'get', 'find']):
+                    severity = 'info' if self.strict else None
+                    if severity:
+                        self.issues.append({
+                            'severity': severity,
+                            'location': location,
+                            'message': 'Command/shell task without changed_when',
+                            'suggestion': 'Add changed_when: false if this is a read-only check'
+                        })
+                else:
+                    self.issues.append({
+                        'severity': 'warning',
+                        'location': location,
+                        'message': 'Command/shell task without changed_when',
+                        'suggestion': 'Add changed_when: to control when task reports as changed'
+                    })
+            else:
+                self.issues.append({
+                    'severity': 'warning',
+                    'location': location,
+                    'message': 'Command/shell task without changed_when or register',
+                    'suggestion': 'Add changed_when: and register: for proper idempotency'
+                })
+
+        # Check shell tasks for set -euo pipefail
+        if 'shell' in module_name and isinstance(module_args, str):
+            if '|' in module_args or '>' in module_args:  # Has pipes or redirects
+                if 'set -euo pipefail' not in module_args and 'set -o pipefail' not in module_args:
+                    self.issues.append({
+                        'severity': 'warning',
+                        'location': location,
+                        'message': 'Shell task with pipes missing "set -euo pipefail"',
+                        'suggestion': 'Add "set -euo pipefail" at the start of shell script'
+                    })
+
+        # Check if command could be shell (uses pipes, redirects, etc.)
+        if 'command' in module_name and isinstance(module_args, str):
+            if any(char in module_args for char in ['|', '>', '<', '&', ';', '$']):
+                self.issues.append({
+                    'severity': 'info',
+                    'location': location,
+                    'message': 'Command module used with shell features',
+                    'suggestion': 'Consider using shell module instead (requires pipes, redirects, etc.)'
+                })
+
+    def _check_secrets(self, task: dict, location: str):
+        """Check if secrets are handled properly."""
+        # Check module type
+        module_name = None
+        for key in task:
+            if key in self.SECRET_MODULES:
+                module_name = key
+                break
+
+        # Check for secret keywords in task
+        task_str = str(task).lower()
+        has_secret_keyword = any(keyword in task_str for keyword in self.SECRET_KEYWORDS)
+
+        # Check module args for password/secret fields
+        has_secret_arg = False
+        for key, value in task.items():
+            if isinstance(value, dict):
+                for arg_key in value:
+                    if any(keyword in arg_key.lower() for keyword in self.SECRET_KEYWORDS):
+                        has_secret_arg = True
+                        break
+
+        if (module_name or has_secret_keyword or has_secret_arg) and 'no_log' not in task:
+            self.issues.append({
+                'severity': 'warning',
+                'location': location,
+                'message': 'Task may handle secrets without no_log: true',
+                'suggestion': 'Add no_log: true to prevent secrets from appearing in logs'
+            })
+
+    def _check_module_names(self, task: dict, location: str):
+        """Check for deprecated short module names."""
+        # Common short names that should be fully qualified
+        short_names = {
+            'copy': 'ansible.builtin.copy',
+            'file': 'ansible.builtin.file',
+            'template': 'ansible.builtin.template',
+            'command': 'ansible.builtin.command',
+            'shell': 'ansible.builtin.shell',
+            'apt': 'ansible.builtin.apt',
+            'yum': 'ansible.builtin.yum',
+            'service': 'ansible.builtin.service',
+            'systemd': 'ansible.builtin.systemd',
+            'user': 'ansible.builtin.user',
+            'group': 'ansible.builtin.group',
+            'debug': 'ansible.builtin.debug',
+            'fail': 'ansible.builtin.fail',
+            'assert': 'ansible.builtin.assert',
+            'set_fact': 'ansible.builtin.set_fact',
+        }
+
+        for short_name, fqcn in short_names.items():
+            if short_name in task and '.' not in short_name:
+                self.issues.append({
+                    'severity': 'info' if not self.strict else 'warning',
+                    'location': location,
+                    'message': f'Using deprecated short module name: {short_name}',
+                    'suggestion': f'Use FQCN: {fqcn}'
+                })
+
+
+def print_issues(playbook_path: Path, issues: List[dict]):
+    """Print issues in a readable format."""
+    if not issues:
+        print(f"✓ {playbook_path}: No issues found")
+        return
+
+    print(f"\n📄 {playbook_path}")
+    print("=" * 70)
+
+    # Group by severity
+    errors = [i for i in issues if i.get('severity') == 'error']
+    warnings = [i for i in issues if i.get('severity') == 'warning']
+    info = [i for i in issues if i.get('severity') == 'info']
+
+    for severity, items, icon in [('ERROR', errors, '❌'), ('WARNING', warnings, '⚠️'), ('INFO', info, 'ℹ️')]:
+        if not items:
+            continue
+
+        print(f"\n{icon} {severity} ({len(items)}):")
+        for issue in items:
+            print(f"   Location: {issue.get('location', 'unknown')}")
+            print(f"   Issue: {issue.get('message')}")
+            if 'suggestion' in issue:
+                print(f"   Suggestion: {issue.get('suggestion')}")
+            print()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Check Ansible playbooks for common idempotency issues"
+    )
+    parser.add_argument(
+        "playbooks",
+        nargs="+",
+        type=Path,
+        help="Playbook files to check"
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Treat informational issues as warnings"
+    )
+    parser.add_argument(
+        "--summary",
+        action="store_true",
+        help="Show only summary, not individual issues"
+    )
+
+    args = parser.parse_args()
+
+    checker = IdempotencyChecker(strict=args.strict)
+    all_issues = {}
+    total_issues = 0
+
+    for playbook_path in args.playbooks:
+        if not playbook_path.exists():
+            print(f"❌ File not found: {playbook_path}", file=sys.stderr)
+            continue
+
+        issues = checker.check_playbook(playbook_path)
+        all_issues[playbook_path] = issues
+        total_issues += len(issues)
+
+        if not args.summary:
+            print_issues(playbook_path, issues)
+
+    # Summary
+    print("\n" + "=" * 70)
+    print(f"📊 Summary: Checked {len(args.playbooks)} playbook(s)")
+    print(f"   Total issues: {total_issues}")
+
+    if total_issues == 0:
+        print("   ✓ All playbooks look good!")
+        sys.exit(0)
+    else:
+        print(f"   ⚠️  Found issues in {sum(1 for i in all_issues.values() if i)} playbook(s)")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/ansible-best-practices/tools/lint-all.sh
+++ b/skills/ansible-best-practices/tools/lint-all.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Run all Ansible linters with proper configuration
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Counters
+TOTAL_CHECKS=0
+FAILED_CHECKS=0
+
+# Function to print section header
+print_header() {
+    echo ""
+    echo "========================================="
+    echo "$1"
+    echo "========================================="
+}
+
+# Function to run a check
+run_check() {
+    local name="$1"
+    local command="$2"
+
+    TOTAL_CHECKS=$((TOTAL_CHECKS + 1))
+
+    echo -n "Running $name... "
+
+    if eval "$command" > /tmp/lint-output.txt 2>&1; then
+        echo -e "${GREEN}✓ PASS${NC}"
+        return 0
+    else
+        echo -e "${RED}✗ FAIL${NC}"
+        cat /tmp/lint-output.txt
+        FAILED_CHECKS=$((FAILED_CHECKS + 1))
+        return 1
+    fi
+}
+
+# Change to ansible directory if not already there
+if [[ ! -d "playbooks" ]] && [[ -d "ansible" ]]; then
+    cd ansible
+fi
+
+print_header "Ansible Playbook Linting"
+
+# Check if ansible-lint is available
+if command -v ansible-lint &> /dev/null; then
+    run_check "ansible-lint (playbooks)" "ansible-lint playbooks/"
+    run_check "ansible-lint (roles)" "ansible-lint roles/ || true"  # May not have roles
+else
+    echo -e "${YELLOW}⚠ ansible-lint not found, skipping${NC}"
+fi
+
+# Check YAML syntax
+print_header "YAML Syntax Validation"
+
+if command -v yamllint &> /dev/null; then
+    run_check "yamllint (playbooks)" "yamllint playbooks/"
+    run_check "yamllint (group_vars)" "yamllint group_vars/ || true"
+    run_check "yamllint (host_vars)" "yamllint host_vars/ || true"
+else
+    echo -e "${YELLOW}⚠ yamllint not found, skipping${NC}"
+fi
+
+# Check playbook syntax
+print_header "Ansible Syntax Check"
+
+for playbook in playbooks/*.yml; do
+    if [[ -f "$playbook" ]]; then
+        playbook_name=$(basename "$playbook")
+        run_check "syntax ($playbook_name)" "ansible-playbook $playbook --syntax-check"
+    fi
+done
+
+# Custom idempotency check (if tool exists)
+print_header "Idempotency Check"
+
+IDEMPOTENCY_TOOL="../.claude/skills/ansible-best-practices/tools/check_idempotency.py"
+if [[ -f "$IDEMPOTENCY_TOOL" ]]; then
+    run_check "idempotency check" "uv run $IDEMPOTENCY_TOOL playbooks/*.yml"
+else
+    echo -e "${YELLOW}⚠ Idempotency checker not found, skipping${NC}"
+fi
+
+# Summary
+print_header "Summary"
+
+echo "Total checks: $TOTAL_CHECKS"
+echo "Passed: $((TOTAL_CHECKS - FAILED_CHECKS))"
+echo "Failed: $FAILED_CHECKS"
+
+if [[ $FAILED_CHECKS -eq 0 ]]; then
+    echo -e "${GREEN}✓ All checks passed!${NC}"
+    exit 0
+else
+    echo -e "${RED}✗ Some checks failed${NC}"
+    exit 1
+fi