Initial commit
This commit is contained in:
12
.claude-plugin/plugin.json
Normal file
12
.claude-plugin/plugin.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "ansible-best-practices",
|
||||
"description": "Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "basher83",
|
||||
"email": "basher83@mail.spaceships.work"
|
||||
},
|
||||
"skills": [
|
||||
"./skills"
|
||||
]
|
||||
}
|
||||
3
README.md
Normal file
3
README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# ansible-best-practices
|
||||
|
||||
Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management
|
||||
117
plugin.lock.json
Normal file
117
plugin.lock.json
Normal file
@@ -0,0 +1,117 @@
|
||||
{
|
||||
"$schema": "internal://schemas/plugin.lock.v1.json",
|
||||
"pluginId": "gh:basher83/lunar-claude:plugins/infrastructure/ansible-best-practices",
|
||||
"normalized": {
|
||||
"repo": null,
|
||||
"ref": "refs/tags/v20251128.0",
|
||||
"commit": "eef1ea0fdc4539368ef81ddc9ac68389c80a1e57",
|
||||
"treeHash": "c9023a71527b9bb43fa99df4eb2c7dc8197daaaa62431b006bcd78599c5390e3",
|
||||
"generatedAt": "2025-11-28T10:14:11.921713Z",
|
||||
"toolVersion": "publish_plugins.py@0.2.0"
|
||||
},
|
||||
"origin": {
|
||||
"remote": "git@github.com:zhongweili/42plugin-data.git",
|
||||
"branch": "master",
|
||||
"commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
|
||||
"repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
|
||||
},
|
||||
"manifest": {
|
||||
"name": "ansible-best-practices",
|
||||
"description": "Ansible playbook refactoring, role development, testing, and best practices with Infisical secrets management",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"content": {
|
||||
"files": [
|
||||
{
|
||||
"path": "README.md",
|
||||
"sha256": "e29716e1fad616884a71aebbba2c77c5948663e492bd1c6989993cc06e6f4d66"
|
||||
},
|
||||
{
|
||||
"path": ".claude-plugin/plugin.json",
|
||||
"sha256": "3c2b518746bbfbddb923eefef236873a6939cc148b0b41dba91e88a4603dd408"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/SKILL.md",
|
||||
"sha256": "c6c05c8d6e3cbad2f377424d7bb7704895f3742c5ae8c6d20d1d7aa20e96196b"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/tools/lint-all.sh",
|
||||
"sha256": "5efc687e1fdf9cf3ca461f559f083f009d4028ab6c4fb170ee3325238d285b74"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/tools/check_idempotency.py",
|
||||
"sha256": "727d4e35a560d50748f1fea99761a4aa14b9646cbdf978c7ec69ea8d0e73f5ce"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/role-structure-standards.md",
|
||||
"sha256": "fa04e62bf3d59a2d883afaa19749850ef73abd524bad38f5193b281a382b0ffc"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/testing-comprehensive.md",
|
||||
"sha256": "f98bf5b1d0ea916beb1ccf66d89504921f4ca2e9bcf7dda7ffaf90cd61fc0877"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/variable-management-patterns.md",
|
||||
"sha256": "49becbed5312d7294321ce443729ccaf8d609f40b738b15dcc4a4271bb8327d0"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/documentation-templates.md",
|
||||
"sha256": "1131d281cc706853ad06fa8d099dcac7e3658e30299d35019382d60e688b8bd0"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/network-automation.md",
|
||||
"sha256": "17fcb8127b7bf96cf5fd3126492c1abf10258c674080acfb3c8af0c5f0565294"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/playbook-role-patterns.md",
|
||||
"sha256": "0d3bca0260266215405c9e15a7876274b37b1b784a4c79c4c80c78f4215e0c08"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/cluster-automation.md",
|
||||
"sha256": "a1f56c9d94370c70bf0ee0187f798f5bd1bdb15a3ff7a931a621a939b8313f9d"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/error-handling.md",
|
||||
"sha256": "736c82e8410ac02ba18c104ef346b9c44e686d060414332db85ba75fe6e1c0d4"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/ceph-automation.md",
|
||||
"sha256": "89a345ce583d56d0a9bfb54b707c8a074c0bf4dbc0951ecdda77af2f82d72024"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/meta-dependencies.md",
|
||||
"sha256": "676ab77408753af4c477ffacceed202e00b4f8a3d360c68dc1b4a725096ccfc3"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/secrets-management.md",
|
||||
"sha256": "484095a5c627fe89964edd3dddd28ef373be993a4276259ad5f2c1e212d05051"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/patterns/handler-best-practices.md",
|
||||
"sha256": "0c58980b793024c84dc1d1573524dd7d04beb97b6ae0127969709f5887317d11"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/anti-patterns/common-mistakes.md",
|
||||
"sha256": "07a257980ddd710c1670f4c286bf3fe6cf5ef95c12e603b2c3566364f144d64b"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/examples/02-infisical-secrets/docker-deployment.yml",
|
||||
"sha256": "56c24f19770ae371717f7fbfbc1b27ad325b871dc852061260d47c8a3a99964c"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/examples/02-infisical-secrets/README.md",
|
||||
"sha256": "c0554e6d3274543cf0b0d29ae4e99465d2f7a3b3dfab01ff9ac14291665823d1"
|
||||
},
|
||||
{
|
||||
"path": "skills/ansible-best-practices/reference/production-repos.md",
|
||||
"sha256": "d7c0eaa4cd41a77135f7c29291aa4b380c65af87d33f58a81f9192999de8353c"
|
||||
}
|
||||
],
|
||||
"dirSha256": "c9023a71527b9bb43fa99df4eb2c7dc8197daaaa62431b006bcd78599c5390e3"
|
||||
},
|
||||
"security": {
|
||||
"scannedAt": null,
|
||||
"scannerVersion": null,
|
||||
"flags": []
|
||||
}
|
||||
}
|
||||
391
skills/ansible-best-practices/SKILL.md
Normal file
391
skills/ansible-best-practices/SKILL.md
Normal file
@@ -0,0 +1,391 @@
|
||||
---
|
||||
name: ansible-best-practices
|
||||
description: >
|
||||
Ansible playbook and role patterns using ansible.builtin modules, community.general,
|
||||
community.proxmox, ansible.posix collections, molecule testing, ansible-lint validation,
|
||||
and Infisical secrets management. Covers idempotency patterns (changed_when, failed_when,
|
||||
register), YAML playbook structure, Jinja2 templating, handler patterns, and variable
|
||||
precedence rules. This skill should be used when writing Ansible playbooks, developing
|
||||
Ansible roles, testing with molecule/ansible-lint, managing secrets with Infisical,
|
||||
implementing idempotent task patterns with changed_when/failed_when directives, or
|
||||
configuring Proxmox/network automation.
|
||||
---
|
||||
|
||||
# Ansible Playbook Best Practices
|
||||
|
||||
Expert guidance for writing maintainable, idempotent, and testable Ansible playbooks based on
|
||||
real-world patterns from this repository.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### Pattern Decision Guide
|
||||
|
||||
| Need | Use Pattern | Details |
|
||||
|------|-------------|---------|
|
||||
| **Use secrets?** | Infisical Secret Management | [patterns/secrets-management.md](patterns/secrets-management.md) |
|
||||
| **Resource management?** | State-Based Playbooks | [patterns/playbook-role-patterns.md](patterns/playbook-role-patterns.md) |
|
||||
| **No native module?** | Hybrid Module Approach | See Hybrid Module section below |
|
||||
| **Task failing?** | Proper Error Handling | [patterns/error-handling.md](patterns/error-handling.md) |
|
||||
| **Repeating blocks?** | Task Organization | [patterns/task-organization.md](patterns/task-organization.md) |
|
||||
| **Network config?** | Network Automation | [patterns/network-automation.md](patterns/network-automation.md) |
|
||||
| **Tasks show 'changed'?** | Idempotency Patterns | [reference/idempotency-patterns.md](reference/idempotency-patterns.md) |
|
||||
|
||||
### Golden Rules
|
||||
|
||||
1. **Use `uv run` prefix** - Always: `uv run ansible-playbook`
|
||||
2. **Fully qualify modules** - `ansible.builtin.copy` not `copy`
|
||||
3. **Secrets via Infisical** - Use reusable task pattern
|
||||
4. **Control `command`/`shell`** - Always use `changed_when`, `failed_when`
|
||||
5. **Use `set -euo pipefail`** - In all shell scripts
|
||||
6. **Tag sensitive tasks** - Use `no_log: true`
|
||||
7. **Idempotency first** - Check before create, verify after
|
||||
|
||||
### Common Commands
|
||||
|
||||
```bash
|
||||
# Lint
|
||||
mise run ansible-lint
|
||||
|
||||
# Analyze complexity
|
||||
./tools/analyze_playbook.py ansible/playbooks/my-playbook.yml
|
||||
|
||||
# Check idempotency
|
||||
./tools/check_idempotency.py ansible/playbooks/my-playbook.yml
|
||||
|
||||
# Run with secrets
|
||||
cd ansible && uv run ansible-playbook playbooks/my-playbook.yml
|
||||
```
|
||||
|
||||
## Core Patterns from This Repository
|
||||
|
||||
### 1. Infisical Secret Management
|
||||
|
||||
This repository uses **Infisical** for centralized secrets management.
|
||||
|
||||
**Quick Pattern:**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve Proxmox credentials
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PROXMOX_PASSWORD'
|
||||
secret_var_name: 'proxmox_password'
|
||||
fallback_env_var: 'PROXMOX_PASSWORD' # Optional
|
||||
```
|
||||
|
||||
**Key Features:** Validates authentication, proper `no_log`, fallback to env vars, reusable across playbooks.
|
||||
|
||||
See [patterns/secrets-management.md](patterns/secrets-management.md) for complete guide including
|
||||
authentication methods, security best practices, and CI/CD integration.
|
||||
|
||||
### 2. State-Based Playbooks
|
||||
|
||||
**Pattern:** Single playbook handles both create and remove via `state` variable.
|
||||
|
||||
```yaml
|
||||
# Create user (default)
|
||||
uv run ansible-playbook playbooks/create-admin-user.yml \
|
||||
-e "admin_name=alice" -e "admin_ssh_key='ssh-ed25519 ...'"
|
||||
|
||||
# Remove user (add state=absent)
|
||||
uv run ansible-playbook playbooks/create-admin-user.yml \
|
||||
-e "admin_name=alice" -e "admin_state=absent"
|
||||
```
|
||||
|
||||
**Why:** Follows community role patterns, single source of truth, consistent interface, less duplication.
|
||||
|
||||
See [patterns/playbook-role-patterns.md](patterns/playbook-role-patterns.md) for complete implementation details and advanced patterns.
|
||||
|
||||
### 3. Hybrid Module Approach
|
||||
|
||||
**Pattern:** Use native modules where available, fall back to `command` when needed.
|
||||
|
||||
```yaml
|
||||
# GOOD: Native module
|
||||
- name: Create Linux system user
|
||||
ansible.builtin.user:
|
||||
name: "{{ system_username }}"
|
||||
state: present
|
||||
|
||||
# ACCEPTABLE: Command when no native module exists
|
||||
- name: Create Proxmox API token
|
||||
ansible.builtin.command: >
|
||||
pveum user token add {{ system_username }}@{{ proxmox_user_realm }}
|
||||
register: token_result
|
||||
changed_when: "'already exists' not in token_result.stderr"
|
||||
failed_when:
|
||||
- token_result.rc != 0
|
||||
- "'already exists' not in token_result.stderr"
|
||||
```
|
||||
|
||||
**Key:** `changed_when` and `failed_when` make `command` module idempotent.
|
||||
|
||||
### 4. Proper Error Handling
|
||||
|
||||
```yaml
|
||||
- name: Check if resource exists
|
||||
ansible.builtin.command: check-resource {{ resource_id }}
|
||||
register: resource_check
|
||||
changed_when: false # Read-only operation
|
||||
failed_when: false # Don't fail, check in next task
|
||||
|
||||
- name: Fail if resource missing
|
||||
ansible.builtin.fail:
|
||||
msg: "Resource {{ resource_id }} not found"
|
||||
when: resource_check.rc != 0
|
||||
```
|
||||
|
||||
See [patterns/error-handling.md](patterns/error-handling.md) for comprehensive patterns.
|
||||
|
||||
### 5. Task Organization
|
||||
|
||||
**Reusable Tasks Pattern:**
|
||||
|
||||
```yaml
|
||||
# In playbook
|
||||
- name: Get database password
|
||||
ansible.builtin.include_tasks: "{{ playbook_dir }}/../tasks/infisical-secret-lookup.yml"
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
```
|
||||
|
||||
Extract common patterns to `tasks/` directory, use `include_tasks` with clear variable contracts.
|
||||
|
||||
See [patterns/task-organization.md](patterns/task-organization.md) and [patterns/reusable-tasks.md](patterns/reusable-tasks.md).
|
||||
|
||||
### 6. Network Automation
|
||||
|
||||
**Pattern:** Use `community.general.interfaces_file` for network configuration.
|
||||
|
||||
```yaml
|
||||
- name: Enable VLAN-aware bridging
|
||||
community.general.interfaces_file:
|
||||
iface: vmbr1
|
||||
option: bridge-vlan-aware
|
||||
value: "yes"
|
||||
backup: true
|
||||
state: present
|
||||
notify: Reload network interfaces
|
||||
```
|
||||
|
||||
Declarative config, automatic backup, handler pattern for reload.
|
||||
|
||||
See [patterns/network-automation.md](patterns/network-automation.md) for advanced patterns including VLAN, bonding, and verification.
|
||||
|
||||
### 7. Idempotency Patterns
|
||||
|
||||
**Use `changed_when` and `failed_when`:**
|
||||
|
||||
```yaml
|
||||
# Check before create
|
||||
- name: Check if VM exists
|
||||
ansible.builtin.shell: |
|
||||
set -o pipefail
|
||||
qm list | awk '{print $1}' | grep -q "^{{ template_id }}$"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: vm_exists
|
||||
changed_when: false # Checking doesn't change anything
|
||||
failed_when: false # Don't fail if not found
|
||||
|
||||
# Conditional create
|
||||
- name: Create VM
|
||||
ansible.builtin.command: qm create {{ template_id }} ...
|
||||
when: vm_exists.rc != 0
|
||||
```
|
||||
|
||||
See [reference/idempotency-patterns.md](reference/idempotency-patterns.md) for comprehensive patterns.
|
||||
|
||||
## Variable Organization
|
||||
|
||||
### Quick Summary
|
||||
|
||||
**Precedence:** Extra vars (`-e`) > Role vars > Defaults
|
||||
|
||||
**Organization:**
|
||||
|
||||
```text
|
||||
ansible/
|
||||
├── group_vars/all.yml # Variables for ALL hosts
|
||||
├── group_vars/proxmox.yml # Group-specific
|
||||
├── host_vars/foxtrot.yml # Host-specific
|
||||
└── playbooks/
|
||||
└── my-playbook.yml # Use vars: for playbook-specific
|
||||
```
|
||||
|
||||
**Key principle:** Use `defaults/main.yml` for configurable options, `vars/main.yml` for constants.
|
||||
|
||||
See [reference/variable-precedence.md](reference/variable-precedence.md) for complete precedence
|
||||
rules (22 levels) and
|
||||
[patterns/variable-management-patterns.md](patterns/variable-management-patterns.md) for
|
||||
advanced patterns.
|
||||
|
||||
## Module Selection
|
||||
|
||||
### Prefer ansible.builtin
|
||||
|
||||
**Always use fully qualified collection names (FQCN):**
|
||||
|
||||
```yaml
|
||||
# GOOD
|
||||
- name: Ping hosts
|
||||
ansible.builtin.ping:
|
||||
|
||||
# BAD (deprecated short names)
|
||||
- name: Ping hosts
|
||||
ping:
|
||||
```
|
||||
|
||||
### Community Collections in Use
|
||||
|
||||
- `community.general` - General utilities (interfaces_file, etc.)
|
||||
- `community.proxmox` - Proxmox VE management
|
||||
- `infisical.vault` - Secrets management
|
||||
- `ansible.posix` - POSIX system management
|
||||
- `community.docker` - Docker management
|
||||
|
||||
See [../../ansible/requirements.yml](../../ansible/requirements.yml) and [reference/collections-guide.md](reference/collections-guide.md).
|
||||
|
||||
## Testing
|
||||
|
||||
### With ansible-lint
|
||||
|
||||
```bash
|
||||
# Run all linters
|
||||
mise run lint-all
|
||||
|
||||
# Just Ansible
|
||||
mise run ansible-lint
|
||||
```
|
||||
|
||||
**Common Issues:** Missing `name:` on tasks, using `shell` instead of `command`, not using
|
||||
`changed_when`, deprecated short names, missing `no_log` on sensitive tasks.
|
||||
|
||||
### With Molecule
|
||||
|
||||
```bash
|
||||
cd tools/molecule/default
|
||||
molecule create # Create test environment
|
||||
molecule converge # Run playbook
|
||||
molecule verify # Run tests
|
||||
molecule destroy # Clean up
|
||||
```
|
||||
|
||||
See [reference/testing-guide.md](reference/testing-guide.md) and [patterns/testing-comprehensive.md](patterns/testing-comprehensive.md) for CI/CD integration.
|
||||
|
||||
## Common Anti-Patterns
|
||||
|
||||
See [anti-patterns/common-mistakes.md](anti-patterns/common-mistakes.md) for detailed examples.
|
||||
|
||||
### Quick List
|
||||
|
||||
**1. Not Using `set -euo pipefail`**
|
||||
|
||||
```yaml
|
||||
# GOOD
|
||||
- name: Run script
|
||||
ansible.builtin.shell: |
|
||||
set -euo pipefail
|
||||
command1 | command2
|
||||
args:
|
||||
executable: /bin/bash
|
||||
```
|
||||
|
||||
**2. Missing `no_log` on Secrets**
|
||||
|
||||
```yaml
|
||||
# GOOD
|
||||
- name: Set password
|
||||
ansible.builtin.command: set-password {{ password }}
|
||||
no_log: true
|
||||
```
|
||||
|
||||
**3. Using `shell` When `command` Suffices**
|
||||
|
||||
Use `shell` ONLY when you need shell features (pipes, redirects, etc.).
|
||||
|
||||
```yaml
|
||||
# GOOD: No shell features needed
|
||||
- name: List files
|
||||
ansible.builtin.command: ls -la
|
||||
```
|
||||
|
||||
See [anti-patterns/common-mistakes.md](anti-patterns/common-mistakes.md) for complete list and
|
||||
[anti-patterns/refactoring-guide.md](anti-patterns/refactoring-guide.md) for improvement
|
||||
strategies.
|
||||
|
||||
## Tools Available
|
||||
|
||||
### Python Analysis Tools (uv)
|
||||
|
||||
```bash
|
||||
# Complexity metrics
|
||||
./tools/analyze_playbook.py playbook.yml
|
||||
|
||||
# Find non-idempotent patterns
|
||||
./tools/check_idempotency.py playbook.yml
|
||||
|
||||
# Variable organization helper
|
||||
./tools/extract_variables.py playbook.yml
|
||||
```
|
||||
|
||||
### Linting
|
||||
|
||||
```bash
|
||||
# Run all linters
|
||||
./tools/lint-all.sh
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
# Molecule test scenarios
|
||||
./tools/molecule/default/
|
||||
```
|
||||
|
||||
## Progressive Disclosure
|
||||
|
||||
Start here, drill down as needed:
|
||||
|
||||
### Quick Reference (Read First)
|
||||
|
||||
- [Playbook & Role Patterns](patterns/playbook-role-patterns.md) - State-based playbooks, public API variables, validation
|
||||
- [Secrets Management](patterns/secrets-management.md) - Infisical integration, authentication, security
|
||||
|
||||
### Deep Patterns (Read When Needed)
|
||||
|
||||
- [Testing Comprehensive](patterns/testing-comprehensive.md) - Molecule, CI/CD, test strategies
|
||||
- [Role Structure Standards](patterns/role-structure-standards.md) - Directory org, naming conventions
|
||||
- [Documentation Templates](patterns/documentation-templates.md) - README structure, variable docs
|
||||
- [Variable Management Patterns](patterns/variable-management-patterns.md) - defaults vs vars, naming
|
||||
- [Handler Best Practices](patterns/handler-best-practices.md) - Handler usage patterns
|
||||
- [Meta Dependencies](patterns/meta-dependencies.md) - galaxy_info, dependencies
|
||||
|
||||
### Advanced Automation (from ProxSpray Analysis)
|
||||
|
||||
- [Cluster Automation](patterns/cluster-automation.md) - Proxmox cluster formation with idempotency
|
||||
- [Network Automation](patterns/network-automation.md) - Declarative network configuration
|
||||
- [CEPH Automation](patterns/ceph-automation.md) - Complete CEPH storage deployment
|
||||
|
||||
### Core Reference
|
||||
|
||||
- [Roles vs Playbooks](reference/roles-vs-playbooks.md) - Organization patterns
|
||||
- [Variable Precedence](reference/variable-precedence.md) - Complete precedence rules (22 levels)
|
||||
- [Idempotency Patterns](reference/idempotency-patterns.md) - Advanced idempotency techniques
|
||||
- [Module Selection](reference/module-selection.md) - Builtin vs community decision guide
|
||||
- [Testing Guide](reference/testing-guide.md) - Molecule and ansible-lint deep dive
|
||||
- [Collections Guide](reference/collections-guide.md) - Using and managing collections
|
||||
- [Production Repos](reference/production-repos.md) - Studied geerlingguy roles index
|
||||
|
||||
### Patterns & Anti-Patterns
|
||||
|
||||
- [Error Handling](patterns/error-handling.md) - Proper error handling patterns
|
||||
- [Task Organization](patterns/task-organization.md) - Reusable tasks and includes
|
||||
- [Common Mistakes](anti-patterns/common-mistakes.md) - What to avoid
|
||||
- [Refactoring Guide](anti-patterns/refactoring-guide.md) - How to improve existing playbooks
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **Proxmox Infrastructure** - Playbooks for template creation and network config
|
||||
- **NetBox + PowerDNS** - Dynamic inventory and secrets management patterns
|
||||
698
skills/ansible-best-practices/anti-patterns/common-mistakes.md
Normal file
698
skills/ansible-best-practices/anti-patterns/common-mistakes.md
Normal file
@@ -0,0 +1,698 @@
|
||||
# Common Ansible Anti-Patterns and Mistakes
|
||||
|
||||
## Overview
|
||||
|
||||
This guide catalogs common mistakes found in Ansible playbooks and provides corrected examples based on Virgo-Core
|
||||
repository best practices.
|
||||
|
||||
## 1. Not Using `set -euo pipefail` in Shell Scripts
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Run multi-line shell script
|
||||
ansible.builtin.shell: |
|
||||
command1
|
||||
command2 | grep something
|
||||
command3
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
|
||||
- Pipe failures ignored (grep returns no matches = rc 1, but shell continues)
|
||||
- Undefined variables silently treated as empty strings
|
||||
- First command failure doesn't stop execution
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Run multi-line shell script
|
||||
ansible.builtin.shell: |
|
||||
set -euo pipefail
|
||||
command1
|
||||
command2 | grep something
|
||||
command3
|
||||
args:
|
||||
executable: /bin/bash
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
|
||||
- `-e`: Exit on first error
|
||||
- `-u`: Treat undefined variables as errors
|
||||
- `-o pipefail`: Pipe fails if any command in pipe fails
|
||||
- `executable: /bin/bash`: Ensures bash (not sh) interprets the script
|
||||
|
||||
## 2. Using Shell When Command Suffices
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: List files
|
||||
ansible.builtin.shell: ls -la /tmp
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
|
||||
- Unnecessary shell overhead
|
||||
- Shell injection risk if variables used
|
||||
- Less portable
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: List files
|
||||
ansible.builtin.command: ls -la /tmp
|
||||
changed_when: false
|
||||
```
|
||||
|
||||
**Use `shell` ONLY when you need:**
|
||||
|
||||
- Pipes: `cat file | grep pattern`
|
||||
- Redirects: `command > output.txt`
|
||||
- Environment expansion: `echo $HOME`
|
||||
- Shell built-ins: `source`, `cd`, etc.
|
||||
|
||||
## 3. Missing `changed_when` on Command/Shell
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Check if VM exists
|
||||
ansible.builtin.command: qm status 101
|
||||
```
|
||||
|
||||
**Problem:** Reports "changed" even though it's a read-only check
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Check if VM exists
|
||||
ansible.builtin.command: qm status 101
|
||||
register: vm_status
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
```
|
||||
|
||||
## 4. Missing `no_log` on Sensitive Tasks
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Create user with password
|
||||
ansible.builtin.user:
|
||||
name: myuser
|
||||
password: "{{ user_password }}"
|
||||
# Password will appear in logs!
|
||||
```
|
||||
|
||||
**Problem:** Sensitive data appears in Ansible logs
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Create user with password
|
||||
ansible.builtin.user:
|
||||
name: myuser
|
||||
password: "{{ user_password }}"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
**Always use `no_log: true` with:**
|
||||
|
||||
- Passwords
|
||||
- API tokens
|
||||
- SSH keys
|
||||
- Certificates
|
||||
- Any PII or sensitive data
|
||||
|
||||
## 5. Using Short Module Names
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Copy file
|
||||
copy:
|
||||
src: file.txt
|
||||
dest: /tmp/file.txt
|
||||
|
||||
- name: Install package
|
||||
apt:
|
||||
name: nginx
|
||||
state: present
|
||||
```
|
||||
|
||||
**Problem:** Short names are deprecated and will be removed
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Copy file
|
||||
ansible.builtin.copy:
|
||||
src: file.txt
|
||||
dest: /tmp/file.txt
|
||||
|
||||
- name: Install package
|
||||
ansible.builtin.apt:
|
||||
name: nginx
|
||||
state: present
|
||||
```
|
||||
|
||||
**Use Fully Qualified Collection Names (FQCN):**
|
||||
|
||||
- `ansible.builtin.copy` not `copy`
|
||||
- `ansible.builtin.command` not `command`
|
||||
- `community.proxmox.proxmox_kvm` not `proxmox_kvm`
|
||||
|
||||
## 6. Hard-Coding Secrets
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Configure database
|
||||
ansible.builtin.template:
|
||||
src: db-config.j2
|
||||
dest: /etc/app/db.yml
|
||||
vars:
|
||||
db_password: "MyPassword123" # NEVER DO THIS!
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
|
||||
- Secrets in version control
|
||||
- No audit trail
|
||||
- Difficult to rotate
|
||||
- Security violation
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Retrieve database password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
|
||||
- name: Configure database
|
||||
ansible.builtin.template:
|
||||
src: db-config.j2
|
||||
dest: /etc/app/db.yml
|
||||
vars:
|
||||
db_password: "{{ db_password }}"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
## 7. Not Handling "Already Exists" Gracefully
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Create API token
|
||||
ansible.builtin.command: pveum user token add terraform@pam terraform-token
|
||||
# Fails if token already exists
|
||||
```
|
||||
|
||||
**Problem:** Playbook not idempotent - fails on second run
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Create API token
|
||||
ansible.builtin.command: pveum user token add terraform@pam terraform-token
|
||||
register: token_result
|
||||
changed_when: "'already exists' not in token_result.stderr"
|
||||
failed_when:
|
||||
- token_result.rc != 0
|
||||
- "'already exists' not in token_result.stderr"
|
||||
```
|
||||
|
||||
**Pattern from repository:** Handle expected errors gracefully
|
||||
|
||||
## 8. Missing Task Names
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- ansible.builtin.apt:
|
||||
name: nginx
|
||||
state: present
|
||||
|
||||
- ansible.builtin.systemd:
|
||||
name: nginx
|
||||
state: started
|
||||
```
|
||||
|
||||
**Problem:** Hard to understand playbook output
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Install Nginx web server
|
||||
ansible.builtin.apt:
|
||||
name: nginx
|
||||
state: present
|
||||
|
||||
- name: Start Nginx service
|
||||
ansible.builtin.systemd:
|
||||
name: nginx
|
||||
state: started
|
||||
enabled: true
|
||||
```
|
||||
|
||||
**ansible-lint will flag this:** `[name[missing]]`
|
||||
|
||||
## 9. Using `when` Instead of `failed_when`
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Run command
|
||||
ansible.builtin.command: some-command
|
||||
register: result
|
||||
ignore_errors: true
|
||||
|
||||
- name: Fail if bad
|
||||
ansible.builtin.fail:
|
||||
msg: "Command failed"
|
||||
when: result.rc != 0 and 'acceptable error' not in result.stderr
|
||||
```
|
||||
|
||||
**Problem:** Two tasks instead of one, less clear
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Run command
|
||||
ansible.builtin.command: some-command
|
||||
register: result
|
||||
failed_when:
|
||||
- result.rc != 0
|
||||
- "'acceptable error' not in result.stderr"
|
||||
```
|
||||
|
||||
## 10. Ignoring Return Codes
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Run deployment script
|
||||
ansible.builtin.command: /usr/local/bin/deploy.sh
|
||||
# No error checking at all
|
||||
```
|
||||
|
||||
**Problem:** Failures go unnoticed
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Run deployment script
|
||||
ansible.builtin.command: /usr/local/bin/deploy.sh
|
||||
register: deploy_result
|
||||
|
||||
- name: Verify deployment succeeded
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- deploy_result.rc == 0
|
||||
- "'SUCCESS' in deploy_result.stdout"
|
||||
fail_msg: "Deployment failed: {{ deploy_result.stderr }}"
|
||||
```
|
||||
|
||||
## 11. Not Using Handlers for Service Restarts
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Update Nginx config
|
||||
ansible.builtin.copy:
|
||||
src: nginx.conf
|
||||
dest: /etc/nginx/nginx.conf
|
||||
|
||||
- name: Restart Nginx
|
||||
ansible.builtin.systemd:
|
||||
name: nginx
|
||||
state: restarted
|
||||
# Always restarts, even if config didn't change
|
||||
```
|
||||
|
||||
**Problem:** Unnecessary service restarts
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Update Nginx config
|
||||
ansible.builtin.copy:
|
||||
src: nginx.conf
|
||||
dest: /etc/nginx/nginx.conf
|
||||
notify: Restart Nginx
|
||||
|
||||
handlers:
|
||||
- name: Restart Nginx
|
||||
ansible.builtin.systemd:
|
||||
name: nginx
|
||||
state: restarted
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
|
||||
- Only restarts if config changes
|
||||
- Multiple tasks can trigger same handler
|
||||
- Handler runs once at end
|
||||
|
||||
## 12. Using `with_items` Instead of `loop`
|
||||
|
||||
### ❌ Wrong (Deprecated)
|
||||
|
||||
```yaml
|
||||
- name: Install packages
|
||||
ansible.builtin.apt:
|
||||
name: "{{ item }}"
|
||||
state: present
|
||||
with_items:
|
||||
- nginx
|
||||
- docker.io
|
||||
- python3-pip
|
||||
```
|
||||
|
||||
**Problem:** `with_items` is deprecated
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Install packages
|
||||
ansible.builtin.apt:
|
||||
name: "{{ item }}"
|
||||
state: present
|
||||
loop:
|
||||
- nginx
|
||||
- docker.io
|
||||
- python3-pip
|
||||
```
|
||||
|
||||
**Even better (single task):**
|
||||
|
||||
```yaml
|
||||
- name: Install packages
|
||||
ansible.builtin.apt:
|
||||
name:
|
||||
- nginx
|
||||
- docker.io
|
||||
- python3-pip
|
||||
state: present
|
||||
```
|
||||
|
||||
## 13. Not Validating Variables
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Create VM
|
||||
community.proxmox.proxmox_kvm:
|
||||
vmid: "{{ vm_id }}"
|
||||
name: "{{ vm_name }}"
|
||||
# ... config ...
|
||||
# What if vm_id or vm_name is undefined?
|
||||
```
|
||||
|
||||
**Problem:** Cryptic errors if variables missing
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Validate VM variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- vm_id is defined
|
||||
- vm_id is number
|
||||
- vm_id >= 100
|
||||
- vm_name is defined
|
||||
- vm_name is match('^[a-z0-9-]+$')
|
||||
fail_msg: |
|
||||
Invalid VM configuration:
|
||||
vm_id: {{ vm_id | default('UNDEFINED') }}
|
||||
vm_name: {{ vm_name | default('UNDEFINED') }}
|
||||
|
||||
- name: Create VM
|
||||
community.proxmox.proxmox_kvm:
|
||||
vmid: "{{ vm_id }}"
|
||||
name: "{{ vm_name }}"
|
||||
# ... config ...
|
||||
```
|
||||
|
||||
## 14. Mixing Logic and Data
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Configure based on hostname
|
||||
ansible.builtin.template:
|
||||
src: app-config.j2
|
||||
dest: /etc/app/config.yml
|
||||
vars:
|
||||
db_host: "{{ 'prod-db' if inventory_hostname == 'prod-server' else 'dev-db' }}"
|
||||
# Logic in vars
|
||||
```
|
||||
|
||||
**Problem:** Hard to maintain, not DRY
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
**In `group_vars/prod.yml`:**
|
||||
|
||||
```yaml
|
||||
db_host: prod-db
|
||||
```
|
||||
|
||||
**In `group_vars/dev.yml`:**
|
||||
|
||||
```yaml
|
||||
db_host: dev-db
|
||||
```
|
||||
|
||||
**In playbook:**
|
||||
|
||||
```yaml
|
||||
- name: Configure application
|
||||
ansible.builtin.template:
|
||||
src: app-config.j2
|
||||
dest: /etc/app/config.yml
|
||||
```
|
||||
|
||||
## 15. Not Using Tags
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
# No tags - must run entire playbook every time
|
||||
- name: Install packages
|
||||
ansible.builtin.apt: ...
|
||||
|
||||
- name: Configure service
|
||||
ansible.builtin.template: ...
|
||||
|
||||
- name: Start service
|
||||
ansible.builtin.systemd: ...
|
||||
```
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Install packages
|
||||
ansible.builtin.apt: ...
|
||||
tags: [install, packages]
|
||||
|
||||
- name: Configure service
|
||||
ansible.builtin.template: ...
|
||||
tags: [config]
|
||||
|
||||
- name: Start service
|
||||
ansible.builtin.systemd: ...
|
||||
tags: [service, start]
|
||||
```
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
# Only run config tasks
|
||||
ansible-playbook playbook.yml --tags config
|
||||
|
||||
# Skip service start
|
||||
ansible-playbook playbook.yml --skip-tags start
|
||||
```
|
||||
|
||||
## 16. Using Bare Variables in Templates
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```jinja
|
||||
# templates/config.j2
|
||||
database_host: {{ db_host }}
|
||||
database_port: {{ db_port }}
|
||||
```
|
||||
|
||||
**Problem:** YAML parsing errors if values contain special characters
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```jinja
|
||||
# templates/config.j2
|
||||
database_host: "{{ db_host }}"
|
||||
database_port: {{ db_port }}
|
||||
```
|
||||
|
||||
**Rule:** Always quote strings, don't quote numbers/booleans
|
||||
|
||||
## 17. Hardcoding Paths
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Copy script
|
||||
ansible.builtin.copy:
|
||||
src: scripts/deploy.sh
|
||||
dest: /opt/myapp/deploy.sh
|
||||
# Assumes specific directory structure
|
||||
```
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Copy script
|
||||
ansible.builtin.copy:
|
||||
src: "{{ playbook_dir }}/../scripts/deploy.sh"
|
||||
dest: "{{ app_install_dir }}/deploy.sh"
|
||||
vars:
|
||||
app_install_dir: /opt/myapp
|
||||
```
|
||||
|
||||
## 18. Not Using Blocks for Related Tasks
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Task 1
|
||||
ansible.builtin.command: task1
|
||||
when: deploy_mode == 'production'
|
||||
|
||||
- name: Task 2
|
||||
ansible.builtin.command: task2
|
||||
when: deploy_mode == 'production'
|
||||
|
||||
- name: Task 3
|
||||
ansible.builtin.command: task3
|
||||
when: deploy_mode == 'production'
|
||||
```
|
||||
|
||||
**Problem:** Repetitive conditions
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Production deployment tasks
|
||||
block:
|
||||
- name: Task 1
|
||||
ansible.builtin.command: task1
|
||||
|
||||
- name: Task 2
|
||||
ansible.builtin.command: task2
|
||||
|
||||
- name: Task 3
|
||||
ansible.builtin.command: task3
|
||||
|
||||
when: deploy_mode == 'production'
|
||||
```
|
||||
|
||||
## 19. Using `sudo` Instead of `become`
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```yaml
|
||||
- name: Install package
|
||||
ansible.builtin.command: sudo apt install nginx
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
|
||||
- Bypasses Ansible's privilege escalation
|
||||
- No become_user support
|
||||
- Less portable
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```yaml
|
||||
- name: Install package
|
||||
ansible.builtin.apt:
|
||||
name: nginx
|
||||
state: present
|
||||
become: true
|
||||
```
|
||||
|
||||
## 20. Not Testing Playbooks
|
||||
|
||||
### ❌ Wrong
|
||||
|
||||
```bash
|
||||
# Write playbook, run directly in production
|
||||
ansible-playbook production.yml
|
||||
```
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```bash
|
||||
# 1. Syntax check
|
||||
ansible-playbook playbook.yml --syntax-check
|
||||
|
||||
# 2. Lint
|
||||
ansible-lint playbook.yml
|
||||
|
||||
# 3. Dry run (check mode)
|
||||
ansible-playbook playbook.yml --check
|
||||
|
||||
# 4. Test in development
|
||||
ansible-playbook playbook.yml -l dev
|
||||
|
||||
# 5. Limited rollout in production
|
||||
ansible-playbook playbook.yml -l prod --limit 1
|
||||
|
||||
# 6. Full production deployment
|
||||
ansible-playbook playbook.yml -l prod
|
||||
```
|
||||
|
||||
## Quick Reference: Ansible-Lint Rules
|
||||
|
||||
Common rules flagged by ansible-lint:
|
||||
|
||||
| Rule ID | Description | Fix |
|
||||
|---------|-------------|-----|
|
||||
| `name[missing]` | Task missing name | Add `name:` field |
|
||||
| `fqcn[action-core]` | Use FQCN for modules | `ansible.builtin.copy` not `copy` |
|
||||
| `no-changed-when` | Command without `changed_when` | Add `changed_when:` |
|
||||
| `risky-shell-pipe` | Shell pipe without `set -o pipefail` | Add `set -euo pipefail` |
|
||||
| `no-log-password` | Password without `no_log` | Add `no_log: true` |
|
||||
|
||||
**Run ansible-lint:**
|
||||
|
||||
```bash
|
||||
cd ansible
|
||||
ansible-lint playbooks/my-playbook.yml
|
||||
```
|
||||
|
||||
## Summary: Best Practices Checklist
|
||||
|
||||
- [ ] Use `set -euo pipefail` in all shell scripts
|
||||
- [ ] Use `changed_when: false` for read-only commands
|
||||
- [ ] Add `no_log: true` to sensitive tasks
|
||||
- [ ] Use FQCN for all modules
|
||||
- [ ] Handle "already exists" errors gracefully
|
||||
- [ ] Add descriptive names to all tasks
|
||||
- [ ] Validate variables with `assert`
|
||||
- [ ] Use handlers for service restarts
|
||||
- [ ] Store secrets in Infisical, not playbooks
|
||||
- [ ] Test with ansible-lint before committing
|
||||
- [ ] Use blocks to group related tasks
|
||||
- [ ] Add tags for selective execution
|
||||
- [ ] Verify critical operations after execution
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Ansible Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html)
|
||||
- [Ansible-Lint Rules](https://ansible-lint.readthedocs.io/rules/)
|
||||
@@ -0,0 +1,475 @@
|
||||
# Docker Deployment with Infisical Secrets
|
||||
|
||||
**Learning objective:** See best practices in action - secrets management, error handling, and idempotency.
|
||||
|
||||
## What This Example Demonstrates
|
||||
|
||||
This playbook showcases **production-ready Ansible patterns** from Virgo-Core:
|
||||
|
||||
✅ **Secrets Management:**
|
||||
|
||||
- Infisical integration using reusable task
|
||||
- Fallback to environment variables
|
||||
- `no_log: true` on sensitive tasks
|
||||
|
||||
✅ **Error Handling:**
|
||||
|
||||
- Pre-flight checks with `assert`
|
||||
- `changed_when` for idempotency
|
||||
- `failed_when` for graceful failures
|
||||
- Block/rescue for rollback
|
||||
|
||||
✅ **Best Practices:**
|
||||
|
||||
- Fully qualified module names (FQCN)
|
||||
- Task organization with blocks
|
||||
- Handlers for service restarts
|
||||
- Verification steps
|
||||
|
||||
✅ **Docker Operations:**
|
||||
|
||||
- Idempotent container management
|
||||
- Health checks with retries
|
||||
- Proper logging on failures
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### 1. Infisical Setup
|
||||
|
||||
**Universal Auth credentials:**
|
||||
|
||||
```bash
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
|
||||
```
|
||||
|
||||
**OR fallback environment variables:**
|
||||
|
||||
```bash
|
||||
export DB_PASSWORD="fallback-db-password"
|
||||
export API_KEY="fallback-api-key"
|
||||
export REDIS_PASSWORD="fallback-redis-password"
|
||||
```
|
||||
|
||||
### 2. Ansible Collections
|
||||
|
||||
```bash
|
||||
# Install required collections
|
||||
cd ../../.. # Back to ansible directory
|
||||
uv run ansible-galaxy collection install -r requirements.yml
|
||||
```
|
||||
|
||||
### 3. Target Hosts
|
||||
|
||||
Update inventory with Docker hosts:
|
||||
|
||||
```ini
|
||||
# inventory/hosts
|
||||
[docker_hosts]
|
||||
docker-01-nexus.spaceships.work
|
||||
```
|
||||
|
||||
### 4. Templates (create these)
|
||||
|
||||
The playbook references templates you need to create:
|
||||
|
||||
**`templates/app-config.yml.j2`:**
|
||||
|
||||
```yaml
|
||||
database:
|
||||
host: db.spaceships.work
|
||||
password: "{{ db_password }}"
|
||||
|
||||
api:
|
||||
key: "{{ api_key }}"
|
||||
|
||||
redis:
|
||||
host: redis.spaceships.work
|
||||
password: "{{ redis_password }}"
|
||||
```
|
||||
|
||||
**`templates/docker-compose.yml.j2`:**
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
app:
|
||||
image: your-app:latest
|
||||
environment:
|
||||
- CONFIG_FILE=/config/config.yml
|
||||
volumes:
|
||||
- {{ app_dir }}/config.yml:/config/config.yml:ro
|
||||
ports:
|
||||
- "8080:8080"
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Validate Playbook
|
||||
|
||||
**Syntax check:**
|
||||
|
||||
```bash
|
||||
ansible-playbook docker-deployment.yml --syntax-check
|
||||
```
|
||||
|
||||
**Lint check:**
|
||||
|
||||
```bash
|
||||
ansible-lint docker-deployment.yml
|
||||
```
|
||||
|
||||
**Dry run:**
|
||||
|
||||
```bash
|
||||
ansible-playbook docker-deployment.yml --check
|
||||
```
|
||||
|
||||
### 2. Run Playbook
|
||||
|
||||
```bash
|
||||
# Full deployment
|
||||
ansible-playbook -i ../../inventory/hosts docker-deployment.yml
|
||||
|
||||
# Specific tags
|
||||
ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags secrets
|
||||
ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags deploy
|
||||
ansible-playbook -i ../../inventory/hosts docker-deployment.yml --tags verify
|
||||
```
|
||||
|
||||
### 3. Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check application health
|
||||
curl http://docker-01-nexus.spaceships.work:8080/health
|
||||
|
||||
# Check Docker containers
|
||||
ssh ansible@docker-01-nexus.spaceships.work "docker ps"
|
||||
```
|
||||
|
||||
## Understanding the Patterns
|
||||
|
||||
### Pattern 1: Infisical Secret Lookup
|
||||
|
||||
**The Pattern:**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve database password from Infisical
|
||||
ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
fallback_env_var: 'DB_PASSWORD'
|
||||
```
|
||||
|
||||
**Why it works:**
|
||||
|
||||
- Reusable task (DRY principle)
|
||||
- Validates authentication before retrieving
|
||||
- Fallback to environment for local dev
|
||||
- No secrets in logs
|
||||
- Clear error messages
|
||||
|
||||
**Learn more:** [../../patterns/secrets-management.md](../../patterns/secrets-management.md)
|
||||
|
||||
### Pattern 2: Pre-flight Validation
|
||||
|
||||
**The Pattern:**
|
||||
|
||||
```yaml
|
||||
pre_tasks:
|
||||
- name: Validate required variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- app_name is defined
|
||||
fail_msg: "Required variables not set"
|
||||
|
||||
- name: Check if Docker is installed
|
||||
ansible.builtin.command: which docker
|
||||
register: docker_check
|
||||
changed_when: false # Check doesn't change state
|
||||
failed_when: false # Don't fail yet
|
||||
```
|
||||
|
||||
**Why it works:**
|
||||
|
||||
- Fails fast with clear messages
|
||||
- Prevents partial deployments
|
||||
- Uses `changed_when: false` for checks
|
||||
- Uses `failed_when: false` to check result later
|
||||
|
||||
### Pattern 3: Idempotent Docker Operations
|
||||
|
||||
**The Pattern:**
|
||||
|
||||
```yaml
|
||||
- name: Check if container is already running
|
||||
ansible.builtin.command: docker ps --filter name={{ app_name }}
|
||||
register: container_check
|
||||
changed_when: false
|
||||
|
||||
- name: Start Docker containers
|
||||
ansible.builtin.command: docker-compose up -d
|
||||
register: compose_up
|
||||
changed_when: "'Creating' in compose_up.stderr or 'Starting' in compose_up.stderr"
|
||||
when: container_check.stdout != app_name
|
||||
```
|
||||
|
||||
**Why it works:**
|
||||
|
||||
- Check first, then create
|
||||
- Only reports "changed" if actually started something
|
||||
- Conditional execution with `when:`
|
||||
- True idempotency
|
||||
|
||||
### Pattern 4: Block/Rescue Error Handling
|
||||
|
||||
**The Pattern:**
|
||||
|
||||
```yaml
|
||||
- name: Docker Management Block
|
||||
block:
|
||||
- name: Pull images
|
||||
# ... tasks ...
|
||||
|
||||
rescue:
|
||||
- name: Show container logs on failure
|
||||
ansible.builtin.command: docker-compose logs --tail=50
|
||||
register: container_logs
|
||||
|
||||
- name: Report failure
|
||||
ansible.builtin.fail:
|
||||
msg: "Deployment failed: {{ container_logs.stdout }}"
|
||||
```
|
||||
|
||||
**Why it works:**
|
||||
|
||||
- Groups related tasks
|
||||
- Automatic rollback on failure
|
||||
- Provides debugging info
|
||||
- Clean error reporting
|
||||
|
||||
**Learn more:** [../../patterns/error-handling.md](../../patterns/error-handling.md)
|
||||
|
||||
### Pattern 5: Health Checks with Retries
|
||||
|
||||
**The Pattern:**
|
||||
|
||||
```yaml
|
||||
- name: Wait for application to be healthy
|
||||
ansible.builtin.uri:
|
||||
url: "http://localhost:8080/health"
|
||||
status_code: 200
|
||||
register: health_check
|
||||
until: health_check.status == 200
|
||||
retries: 30
|
||||
delay: 10
|
||||
```
|
||||
|
||||
**Why it works:**
|
||||
|
||||
- Automatic retries for transient failures
|
||||
- Configurable timeout (30 × 10s = 5 minutes)
|
||||
- Fails clearly if never becomes healthy
|
||||
|
||||
## Common Mistakes Avoided
|
||||
|
||||
This playbook avoids common anti-patterns:
|
||||
|
||||
### ❌ Anti-pattern 1: Hard-coded Secrets
|
||||
|
||||
```yaml
|
||||
# DON'T DO THIS!
|
||||
- name: Deploy config
|
||||
ansible.builtin.template:
|
||||
src: config.j2
|
||||
dest: /etc/app/config.yml
|
||||
vars:
|
||||
db_password: "MyPassword123" # NEVER!
|
||||
```
|
||||
|
||||
✅ **This playbook:** Uses Infisical with fallback to environment
|
||||
|
||||
### ❌ Anti-pattern 2: Missing changed_when
|
||||
|
||||
```yaml
|
||||
# DON'T DO THIS!
|
||||
- name: Start container
|
||||
ansible.builtin.command: docker start myapp
|
||||
# Always reports "changed" even if already running
|
||||
```
|
||||
|
||||
✅ **This playbook:** Checks first, uses `changed_when` to detect actual changes
|
||||
|
||||
### ❌ Anti-pattern 3: No Error Handling
|
||||
|
||||
```yaml
|
||||
# DON'T DO THIS!
|
||||
- name: Deploy app
|
||||
ansible.builtin.command: deploy.sh
|
||||
# No check if it worked, no cleanup on failure
|
||||
```
|
||||
|
||||
✅ **This playbook:** Uses block/rescue, verifies success
|
||||
|
||||
### ❌ Anti-pattern 4: Secrets in Logs
|
||||
|
||||
```yaml
|
||||
# DON'T DO THIS!
|
||||
- name: Set password
|
||||
ansible.builtin.command: set-password {{ password }}
|
||||
# Password visible in Ansible output!
|
||||
```
|
||||
|
||||
✅ **This playbook:** Uses `no_log: true` on sensitive tasks
|
||||
|
||||
## Customization
|
||||
|
||||
### Different Application
|
||||
|
||||
Change variables:
|
||||
|
||||
```yaml
|
||||
vars:
|
||||
app_name: "my-other-app"
|
||||
app_dir: "/opt/my-other-app"
|
||||
```
|
||||
|
||||
### Different Secrets
|
||||
|
||||
Add more secret retrievals:
|
||||
|
||||
```yaml
|
||||
- name: Retrieve JWT secret
|
||||
ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'JWT_SECRET'
|
||||
secret_var_name: 'jwt_secret'
|
||||
```
|
||||
|
||||
### Skip Health Check
|
||||
|
||||
```bash
|
||||
ansible-playbook docker-deployment.yml --skip-tags verify
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Infisical Authentication Failed
|
||||
|
||||
**Error:** `Missing Infisical authentication credentials`
|
||||
|
||||
**Solution:**
|
||||
|
||||
```bash
|
||||
# Check environment variables
|
||||
echo $INFISICAL_UNIVERSAL_AUTH_CLIENT_ID
|
||||
echo $INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET
|
||||
|
||||
# OR use fallback
|
||||
export DB_PASSWORD="fallback-password"
|
||||
```
|
||||
|
||||
### Docker Not Installed
|
||||
|
||||
**Error:** `Docker is not installed`
|
||||
|
||||
**Solution:**
|
||||
|
||||
```bash
|
||||
# Install Docker on target host
|
||||
ssh ansible@docker-host
|
||||
sudo apt update
|
||||
sudo apt install docker.io docker-compose
|
||||
```
|
||||
|
||||
### Container Won't Start
|
||||
|
||||
**Error:** `Docker deployment failed`
|
||||
|
||||
**Solution:** Playbook shows logs automatically in rescue block. Review output for errors.
|
||||
|
||||
**Manual check:**
|
||||
|
||||
```bash
|
||||
ssh ansible@docker-host
|
||||
cd /opt/my-application
|
||||
docker-compose logs
|
||||
```
|
||||
|
||||
### Health Check Timeout
|
||||
|
||||
**Error:** `Wait for application to be healthy` times out
|
||||
|
||||
**Solution:**
|
||||
|
||||
```yaml
|
||||
# Increase retries/delay
|
||||
retries: 60 # 10 minutes
|
||||
delay: 10
|
||||
```
|
||||
|
||||
## Testing the Playbook
|
||||
|
||||
### Check Idempotency
|
||||
|
||||
```bash
|
||||
# Run twice - second run should show no changes
|
||||
ansible-playbook docker-deployment.yml
|
||||
ansible-playbook docker-deployment.yml # Should be all "ok", no "changed"
|
||||
```
|
||||
|
||||
### Run Linters
|
||||
|
||||
```bash
|
||||
# Ansible lint
|
||||
ansible-lint docker-deployment.yml
|
||||
|
||||
# Custom idempotency check
|
||||
../../tools/check_idempotency.py docker-deployment.yml
|
||||
|
||||
# Full lint suite
|
||||
../../tools/lint-all.sh
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
### Learn More Patterns
|
||||
|
||||
- **Error Handling:** [../../patterns/error-handling.md](../../patterns/error-handling.md)
|
||||
- **Secrets Management:** [../../patterns/secrets-management.md](../../patterns/secrets-management.md)
|
||||
- **Common Mistakes:** [../../anti-patterns/common-mistakes.md](../../anti-patterns/common-mistakes.md)
|
||||
|
||||
### Additional Examples
|
||||
|
||||
- **Basic Playbook:** `../01-basic-playbook/` - Simpler starting point
|
||||
- **Repository Playbooks:** `../../../ansible/playbooks/` - Real production playbooks
|
||||
|
||||
### Best Practices
|
||||
|
||||
Review the main skill:
|
||||
|
||||
- [../../SKILL.md](../../SKILL.md) - Complete best practices guide
|
||||
|
||||
## Why These Patterns Matter
|
||||
|
||||
**In Production:**
|
||||
|
||||
- ✅ Secrets never in version control
|
||||
- ✅ Playbooks are truly idempotent
|
||||
- ✅ Clear error messages for troubleshooting
|
||||
- ✅ Audit trail for all operations
|
||||
- ✅ Rollback on failures
|
||||
|
||||
**For Teams:**
|
||||
|
||||
- ✅ Consistent patterns across playbooks
|
||||
- ✅ Easy to understand and maintain
|
||||
- ✅ Self-documenting code
|
||||
- ✅ Reduced bus factor
|
||||
|
||||
**For You:**
|
||||
|
||||
- ✅ Confidence in deployments
|
||||
- ✅ Less time debugging
|
||||
- ✅ Better sleep at night!
|
||||
@@ -0,0 +1,211 @@
|
||||
---
|
||||
# =============================================================================
|
||||
# Docker Deployment with Infisical Secrets
|
||||
# =============================================================================
|
||||
# This playbook demonstrates best practices from Virgo-Core:
|
||||
# - Infisical secrets management (using reusable task)
|
||||
# - Proper error handling with changed_when/failed_when
|
||||
# - Idempotent command execution
|
||||
# - No secrets in logs (no_log: true)
|
||||
# - Fully qualified module names (FQCN)
|
||||
# - Task organization with blocks
|
||||
|
||||
- name: Deploy Docker application with secrets from Infisical
|
||||
hosts: docker_hosts
|
||||
become: true
|
||||
gather_facts: true
|
||||
|
||||
vars:
|
||||
app_name: "my-application"
|
||||
app_dir: "/opt/{{ app_name }}"
|
||||
infisical_project_id: "7b832220-24c0-45bc-a5f1-ce9794a31259"
|
||||
infisical_env: "prod"
|
||||
infisical_path: "/doggos-cluster"
|
||||
|
||||
# ==========================================================================
|
||||
# Pre-flight Checks
|
||||
# ==========================================================================
|
||||
|
||||
pre_tasks:
|
||||
- name: Validate required variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- app_name is defined and app_name | length > 0
|
||||
- app_dir is defined
|
||||
- infisical_project_id is defined
|
||||
fail_msg: "Required variables not set"
|
||||
success_msg: "All required variables present"
|
||||
tags: [always]
|
||||
|
||||
- name: Check if Docker is installed
|
||||
ansible.builtin.command: which docker
|
||||
register: docker_check
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
tags: [always]
|
||||
|
||||
- name: Fail if Docker not installed
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Docker is not installed on {{ inventory_hostname }}
|
||||
Please install Docker first: sudo apt install docker.io
|
||||
when: docker_check.rc != 0
|
||||
tags: [always]
|
||||
|
||||
# ==========================================================================
|
||||
# Main Tasks
|
||||
# ==========================================================================
|
||||
|
||||
tasks:
|
||||
# ========================================================================
|
||||
# Retrieve Secrets from Infisical
|
||||
# ========================================================================
|
||||
|
||||
- name: Secrets Management Block
|
||||
block:
|
||||
- name: Retrieve database password from Infisical
|
||||
ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
fallback_env_var: 'DB_PASSWORD' # Optional fallback
|
||||
|
||||
- name: Retrieve API key from Infisical
|
||||
ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'API_KEY'
|
||||
secret_var_name: 'api_key'
|
||||
fallback_env_var: 'API_KEY'
|
||||
|
||||
- name: Retrieve Redis password from Infisical
|
||||
ansible.builtin.include_tasks: ../../tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'REDIS_PASSWORD'
|
||||
secret_var_name: 'redis_password'
|
||||
fallback_env_var: 'REDIS_PASSWORD'
|
||||
|
||||
tags: [secrets, config]
|
||||
|
||||
# ========================================================================
|
||||
# Application Setup
|
||||
# ========================================================================
|
||||
|
||||
- name: Application Deployment Block
|
||||
block:
|
||||
- name: Create application directory
|
||||
ansible.builtin.file:
|
||||
path: "{{ app_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Deploy application configuration
|
||||
ansible.builtin.template:
|
||||
src: app-config.yml.j2
|
||||
dest: "{{ app_dir }}/config.yml"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0600' # Secure permissions for config with secrets
|
||||
notify: Restart application
|
||||
no_log: true # Config contains secrets
|
||||
|
||||
- name: Deploy Docker Compose file
|
||||
ansible.builtin.template:
|
||||
src: docker-compose.yml.j2
|
||||
dest: "{{ app_dir }}/docker-compose.yml"
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
rescue:
|
||||
- name: Report deployment failure
|
||||
ansible.builtin.fail:
|
||||
msg: "Failed to deploy application configuration"
|
||||
|
||||
tags: [deploy, config]
|
||||
|
||||
# ========================================================================
|
||||
# Docker Operations (with proper idempotency)
|
||||
# ========================================================================
|
||||
|
||||
- name: Docker Management Block
|
||||
block:
|
||||
- name: Check if container is already running
|
||||
ansible.builtin.command: docker ps --filter name={{ app_name }} --format "{{ '{{' }}.Names{{ '}}' }}"
|
||||
register: container_check
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Pull Docker images
|
||||
ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml pull
|
||||
args:
|
||||
chdir: "{{ app_dir }}"
|
||||
register: pull_result
|
||||
changed_when: "'Downloaded newer image' in pull_result.stdout"
|
||||
when: container_check.stdout != app_name
|
||||
|
||||
- name: Start Docker containers
|
||||
ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml up -d
|
||||
args:
|
||||
chdir: "{{ app_dir }}"
|
||||
register: compose_up
|
||||
changed_when: "'Creating' in compose_up.stderr or 'Starting' in compose_up.stderr"
|
||||
when: container_check.stdout != app_name
|
||||
|
||||
- name: Wait for application to be healthy
|
||||
ansible.builtin.uri:
|
||||
url: "http://localhost:8080/health"
|
||||
status_code: 200
|
||||
register: health_check
|
||||
until: health_check.status == 200
|
||||
retries: 30
|
||||
delay: 10
|
||||
changed_when: false
|
||||
|
||||
rescue:
|
||||
- name: Show container logs on failure
|
||||
ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml logs --tail=50
|
||||
args:
|
||||
chdir: "{{ app_dir }}"
|
||||
register: container_logs
|
||||
changed_when: false
|
||||
|
||||
- name: Report Docker failure
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Docker deployment failed
|
||||
Logs: {{ container_logs.stdout }}
|
||||
|
||||
tags: [deploy, docker]
|
||||
|
||||
# ========================================================================
|
||||
# Verification
|
||||
# ========================================================================
|
||||
|
||||
- name: Verify application is running
|
||||
ansible.builtin.command: docker ps --filter name={{ app_name }} --filter status=running --format "{{ '{{' }}.Status{{ '}}' }}"
|
||||
register: running_check
|
||||
changed_when: false
|
||||
failed_when: "'Up' not in running_check.stdout"
|
||||
tags: [verify]
|
||||
|
||||
- name: Report deployment success
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
✓ Application deployed successfully
|
||||
Container: {{ app_name }}
|
||||
Status: {{ running_check.stdout }}
|
||||
Health endpoint: http://{{ inventory_hostname }}:8080/health
|
||||
tags: [verify]
|
||||
|
||||
# ==========================================================================
|
||||
# Handlers
|
||||
# ==========================================================================
|
||||
|
||||
handlers:
|
||||
- name: Restart application
|
||||
ansible.builtin.command: docker-compose -f {{ app_dir }}/docker-compose.yml restart
|
||||
args:
|
||||
chdir: "{{ app_dir }}"
|
||||
changed_when: true
|
||||
687
skills/ansible-best-practices/patterns/ceph-automation.md
Normal file
687
skills/ansible-best-practices/patterns/ceph-automation.md
Normal file
@@ -0,0 +1,687 @@
|
||||
# CEPH Storage Automation Patterns
|
||||
|
||||
Best practices for automating CEPH cluster deployment in Proxmox VE environments.
|
||||
|
||||
## Pattern: Declarative CEPH OSD Configuration
|
||||
|
||||
**Problem**: ProxSpray leaves OSD creation as a manual step, defeating the purpose of automation.
|
||||
|
||||
**Solution**: Fully automate OSD creation with declarative configuration that specifies devices and partitioning.
|
||||
|
||||
### Configuration Model
|
||||
|
||||
```yaml
|
||||
# group_vars/matrix_cluster.yml
|
||||
---
|
||||
# CEPH network configuration
|
||||
ceph_enabled: true
|
||||
ceph_network: "192.168.5.0/24" # Public network (vmbr1)
|
||||
ceph_cluster_network: "192.168.7.0/24" # Private network (vmbr2)
|
||||
|
||||
# OSD configuration per node (4 OSDs per node = 12 total)
|
||||
ceph_osds:
|
||||
foxtrot:
|
||||
- device: /dev/nvme1n1
|
||||
partitions: 2 # Create 2 OSDs per 4TB NVMe
|
||||
db_device: null
|
||||
wal_device: null
|
||||
crush_device_class: nvme
|
||||
- device: /dev/nvme2n1
|
||||
partitions: 2
|
||||
db_device: null
|
||||
wal_device: null
|
||||
crush_device_class: nvme
|
||||
|
||||
golf:
|
||||
- device: /dev/nvme1n1
|
||||
partitions: 2
|
||||
crush_device_class: nvme
|
||||
- device: /dev/nvme2n1
|
||||
partitions: 2
|
||||
crush_device_class: nvme
|
||||
|
||||
hotel:
|
||||
- device: /dev/nvme1n1
|
||||
partitions: 2
|
||||
crush_device_class: nvme
|
||||
- device: /dev/nvme2n1
|
||||
partitions: 2
|
||||
crush_device_class: nvme
|
||||
|
||||
# Pool configuration
|
||||
ceph_pools:
|
||||
- name: vm_ssd
|
||||
pg_num: 128
|
||||
pgp_num: 128
|
||||
size: 3 # Replicate across 3 nodes
|
||||
min_size: 2 # Minimum 2 replicas required
|
||||
application: rbd
|
||||
crush_rule: replicated_rule
|
||||
compression: false
|
||||
|
||||
- name: vm_containers
|
||||
pg_num: 64
|
||||
pgp_num: 64
|
||||
size: 3
|
||||
min_size: 2
|
||||
application: rbd
|
||||
crush_rule: replicated_rule
|
||||
compression: true
|
||||
```
|
||||
|
||||
## Pattern: Idempotent CEPH Installation
|
||||
|
||||
**Problem**: CEPH installation commands fail if already installed.
|
||||
|
||||
**Solution**: Check CEPH status before attempting installation.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/install.yml
|
||||
---
|
||||
- name: Check if CEPH is already installed
|
||||
ansible.builtin.stat:
|
||||
path: /etc/pve/ceph.conf
|
||||
register: ceph_conf_check
|
||||
|
||||
- name: Check CEPH packages
|
||||
ansible.builtin.command:
|
||||
cmd: dpkg -l ceph-common
|
||||
register: ceph_package_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Install CEPH packages
|
||||
ansible.builtin.command:
|
||||
cmd: "pveceph install --repository no-subscription"
|
||||
when:
|
||||
- ceph_package_check.rc != 0
|
||||
register: ceph_install
|
||||
changed_when: "'installed' in ceph_install.stdout"
|
||||
|
||||
- name: Verify CEPH installation
|
||||
ansible.builtin.command:
|
||||
cmd: ceph --version
|
||||
register: ceph_version
|
||||
changed_when: false
|
||||
failed_when: ceph_version.rc != 0
|
||||
```
|
||||
|
||||
## Pattern: CEPH Cluster Initialization
|
||||
|
||||
**Problem**: CEPH cluster can only be initialized once, must be idempotent.
|
||||
|
||||
**Solution**: Check for existing cluster configuration before initialization.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/init.yml
|
||||
---
|
||||
- name: Check if CEPH cluster is initialized
|
||||
ansible.builtin.command:
|
||||
cmd: ceph status
|
||||
register: ceph_status_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set CEPH initialization facts
|
||||
ansible.builtin.set_fact:
|
||||
ceph_initialized: "{{ ceph_status_check.rc == 0 }}"
|
||||
is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group][0] }}"
|
||||
|
||||
- name: Initialize CEPH cluster on first node
|
||||
ansible.builtin.command:
|
||||
cmd: "pveceph init --network {{ ceph_network }} --cluster-network {{ ceph_cluster_network }}"
|
||||
when:
|
||||
- is_ceph_first_node | default(false)
|
||||
- not ceph_initialized
|
||||
register: ceph_init
|
||||
changed_when: ceph_init.rc == 0
|
||||
|
||||
- name: Wait for CEPH cluster to initialize
|
||||
ansible.builtin.pause:
|
||||
seconds: 15
|
||||
when: ceph_init.changed
|
||||
```
|
||||
|
||||
## Pattern: CEPH Monitor Creation
|
||||
|
||||
**Problem**: Monitors must be created in specific order and verified for quorum.
|
||||
|
||||
**Solution**: Create monitors with proper ordering and quorum verification.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/monitors.yml
|
||||
---
|
||||
- name: Check existing CEPH monitors
|
||||
ansible.builtin.command:
|
||||
cmd: ceph mon dump
|
||||
register: mon_dump
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set monitor facts
|
||||
ansible.builtin.set_fact:
|
||||
has_monitor: "{{ inventory_hostname in mon_dump.stdout }}"
|
||||
when: mon_dump.rc == 0
|
||||
|
||||
- name: Set local is_ceph_first_node fact
|
||||
ansible.builtin.set_fact:
|
||||
is_ceph_first_node: "{{ inventory_hostname == groups[cluster_group][0] }}"
|
||||
|
||||
- name: Create CEPH monitor on first node
|
||||
ansible.builtin.command:
|
||||
cmd: pveceph mon create
|
||||
when:
|
||||
- is_ceph_first_node | default(false)
|
||||
- not has_monitor | default(false)
|
||||
register: mon_create_first
|
||||
changed_when: mon_create_first.rc == 0
|
||||
|
||||
- name: Wait for first monitor to stabilize
|
||||
ansible.builtin.pause:
|
||||
seconds: 10
|
||||
when: mon_create_first.changed
|
||||
|
||||
- name: Create CEPH monitors on other nodes
|
||||
ansible.builtin.command:
|
||||
cmd: pveceph mon create
|
||||
when:
|
||||
- not (is_ceph_first_node | default(false))
|
||||
- not has_monitor | default(false)
|
||||
register: mon_create_others
|
||||
changed_when: mon_create_others.rc == 0
|
||||
|
||||
- name: Verify monitor quorum
|
||||
ansible.builtin.command:
|
||||
cmd: ceph quorum_status
|
||||
register: quorum_status
|
||||
changed_when: false
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
vars:
|
||||
expected_mons: "{{ ceph_mon_count | default(3) }}"
|
||||
failed_when: ((quorum_status.stdout | from_json).quorum | length) < expected_mons
|
||||
```
|
||||
|
||||
## Pattern: CEPH Manager Creation
|
||||
|
||||
**Problem**: Managers provide web interface and monitoring; should run on all nodes for HA.
|
||||
|
||||
**Solution**: Create managers on all nodes with proper verification.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/managers.yml
|
||||
---
|
||||
- name: Check existing CEPH managers
|
||||
ansible.builtin.command:
|
||||
cmd: ceph mgr dump
|
||||
register: mgr_dump
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set manager facts
|
||||
ansible.builtin.set_fact:
|
||||
has_manager: "{{ inventory_hostname in mgr_dump.stdout }}"
|
||||
when: mgr_dump.rc == 0
|
||||
|
||||
- name: Create CEPH manager
|
||||
ansible.builtin.command:
|
||||
cmd: pveceph mgr create
|
||||
when: not has_manager | default(false)
|
||||
register: mgr_create
|
||||
changed_when: mgr_create.rc == 0
|
||||
|
||||
- name: Enable CEPH dashboard module
|
||||
ansible.builtin.command:
|
||||
cmd: ceph mgr module enable dashboard
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
register: dashboard_enable
|
||||
changed_when: "'already enabled' not in dashboard_enable.stderr"
|
||||
failed_when:
|
||||
- dashboard_enable.rc != 0
|
||||
- "'already enabled' not in dashboard_enable.stderr"
|
||||
```
|
||||
|
||||
## Pattern: Automated OSD Creation with Partitioning
|
||||
|
||||
**Problem**: Manual OSD creation is error-prone and doesn't support partitioning large drives.
|
||||
|
||||
**Solution**: Automate partition creation and OSD deployment.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/osd_create.yml
|
||||
---
|
||||
- name: Get list of existing OSDs
|
||||
ansible.builtin.command:
|
||||
cmd: pveceph osd ls
|
||||
register: existing_osds
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Probe existing CEPH volumes
|
||||
ansible.builtin.command:
|
||||
cmd: ceph-volume lvm list --format json
|
||||
register: ceph_volume_probe
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Check OSD devices availability
|
||||
ansible.builtin.command:
|
||||
cmd: "lsblk -ndo NAME,TYPE {{ item.device }}"
|
||||
register: device_check
|
||||
failed_when: device_check.rc != 0
|
||||
changed_when: false
|
||||
loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
|
||||
loop_control:
|
||||
label: "{{ item.device }}"
|
||||
|
||||
- name: Wipe existing partitions on OSD devices
|
||||
ansible.builtin.command:
|
||||
cmd: "wipefs -a {{ item.device }}"
|
||||
when:
|
||||
- ceph_volume_probe.rc == 0
|
||||
- ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device) | list | length == 0
|
||||
- ceph_wipe_disks | default(false)
|
||||
loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
|
||||
loop_control:
|
||||
label: "{{ item.device }}"
|
||||
register: wipe_result
|
||||
changed_when: wipe_result.rc == 0
|
||||
|
||||
- name: Build list of partitions to create
|
||||
ansible.builtin.set_fact:
|
||||
osd_partitions: >-
|
||||
{% set result = [] -%}
|
||||
{% for osd in ceph_osds[inventory_hostname_short] | default([]) -%}
|
||||
{% if (osd.partitions | default(1) | int) > 1 -%}
|
||||
{% for part_num in range(1, (osd.partitions | int) + 1) -%}
|
||||
{% set _ = result.append({
|
||||
'device': osd.device,
|
||||
'partition_num': part_num,
|
||||
'total_partitions': osd.partitions,
|
||||
'db_device': osd.get('db_device'),
|
||||
'wal_device': osd.get('wal_device')
|
||||
}) -%}
|
||||
{% endfor -%}
|
||||
{% endif -%}
|
||||
{% endfor -%}
|
||||
{{ result }}
|
||||
|
||||
- name: Create partitions for multiple OSDs per device
|
||||
community.general.parted:
|
||||
device: "{{ item.device }}"
|
||||
number: "{{ item.partition_num }}"
|
||||
state: present
|
||||
part_start: "{{ ((item.partition_num - 1) * (100 / item.total_partitions)) }}%"
|
||||
part_end: "{{ (item.partition_num * (100 / item.total_partitions)) }}%"
|
||||
label: gpt
|
||||
loop: "{{ osd_partitions }}"
|
||||
loop_control:
|
||||
label: "{{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}"
|
||||
|
||||
- name: Create OSDs from whole devices
|
||||
ansible.builtin.command:
|
||||
cmd: >
|
||||
pveceph osd create {{ item.device }}
|
||||
{% if item.db_device %}--db_dev {{ item.db_device }}{% endif %}
|
||||
{% if item.wal_device %}--wal_dev {{ item.wal_device }}{% endif %}
|
||||
when:
|
||||
- item.partitions | default(1) == 1
|
||||
- ceph_volume_probe.rc == 0
|
||||
- ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device + '$') | list | length == 0
|
||||
loop: "{{ ceph_osds[inventory_hostname_short] | default([]) }}"
|
||||
loop_control:
|
||||
label: "{{ item.device }}"
|
||||
register: osd_create_whole
|
||||
changed_when: "'successfully created' in osd_create_whole.stdout"
|
||||
failed_when:
|
||||
- osd_create_whole.rc != 0
|
||||
- "'already in use' not in osd_create_whole.stderr"
|
||||
|
||||
- name: Create OSDs from partitions
|
||||
ansible.builtin.command:
|
||||
cmd: >
|
||||
pveceph osd create {{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}
|
||||
{% if item.db_device %}--db_dev {{ item.db_device }}{% endif %}
|
||||
{% if item.wal_device %}--wal_dev {{ item.wal_device %}{% endif %}
|
||||
when:
|
||||
- ceph_volume_probe.rc == 0
|
||||
- ceph_volume_probe.stdout | from_json | dict2items | selectattr('value.0.devices', 'defined') | map(attribute='value.0.devices') | flatten | select('match', '^' + item.device + ('p' if item.device.startswith('/dev/nvme') else '') + (item.partition_num | string) + '$') | list | length == 0
|
||||
loop: "{{ osd_partitions }}"
|
||||
loop_control:
|
||||
label: "{{ item.device }}{{ 'p' if item.device.startswith('/dev/nvme') else '' }}{{ item.partition_num }}"
|
||||
register: osd_create_partition
|
||||
changed_when: "'successfully created' in osd_create_partition.stdout"
|
||||
failed_when:
|
||||
- osd_create_partition.rc != 0
|
||||
- "'already in use' not in osd_create_partition.stderr"
|
||||
|
||||
- name: Wait for OSDs to come up
|
||||
ansible.builtin.command:
|
||||
cmd: ceph osd tree
|
||||
register: osd_tree
|
||||
changed_when: false
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
until: "'up' in osd_tree.stdout"
|
||||
retries: 10
|
||||
delay: 5
|
||||
```
|
||||
|
||||
## Pattern: CEPH Pool Creation
|
||||
|
||||
**Problem**: Pools must be created with proper PG counts, replication, and application tags.
|
||||
|
||||
**Solution**: Declarative pool configuration with validation.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/pools.yml
|
||||
---
|
||||
- name: Get existing CEPH pools
|
||||
ansible.builtin.command:
|
||||
cmd: ceph osd pool ls
|
||||
register: existing_pools
|
||||
changed_when: false
|
||||
|
||||
- name: Create CEPH pools
|
||||
ansible.builtin.command:
|
||||
cmd: >
|
||||
ceph osd pool create {{ item.name }}
|
||||
{{ item.pg_num }}
|
||||
{{ item.pgp_num | default(item.pg_num) }}
|
||||
replicated
|
||||
{{ item.crush_rule | default('replicated_rule') }}
|
||||
when: item.name not in existing_pools.stdout_lines
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
register: pool_create
|
||||
changed_when: pool_create.rc == 0
|
||||
|
||||
- name: Get current pool replication size
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool get {{ item.name }} size -f json"
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
register: pool_size_current
|
||||
changed_when: false
|
||||
|
||||
- name: Set pool replication size
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool set {{ item.name }} size {{ item.size }}"
|
||||
when: (pool_size_current.results[loop_index].stdout | from_json).size != item.size
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
index_var: loop_index
|
||||
|
||||
- name: Get current pool minimum replication size
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool get {{ item.name }} min_size -f json"
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
register: pool_min_size_current
|
||||
changed_when: false
|
||||
|
||||
- name: Set pool minimum replication size
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool set {{ item.name }} min_size {{ item.min_size }}"
|
||||
when: (pool_min_size_current.results[loop_index].stdout | from_json).min_size != item.min_size
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
index_var: loop_index
|
||||
|
||||
- name: Get current pool applications
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool application get {{ item.name }} -f json"
|
||||
when: item.application is defined
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
register: pool_app_current
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Set pool application
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool application enable {{ item.name }} {{ item.application }}"
|
||||
when:
|
||||
- item.application is defined
|
||||
- pool_app_current.results[loop_index].rc == 0
|
||||
- item.application not in (pool_app_current.results[loop_index].stdout | from_json | default({}))
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
index_var: loop_index
|
||||
|
||||
- name: Get current pool compression mode
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool get {{ item.name }} compression_mode -f json"
|
||||
when: item.compression | default(false)
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
register: pool_compression_current
|
||||
changed_when: false
|
||||
|
||||
- name: Enable compression on pools
|
||||
ansible.builtin.command:
|
||||
cmd: "ceph osd pool set {{ item.name }} compression_mode aggressive"
|
||||
when:
|
||||
- item.compression | default(false)
|
||||
- (pool_compression_current.results[loop_index].stdout | from_json).compression_mode != 'aggressive'
|
||||
loop: "{{ ceph_pools }}"
|
||||
loop_control:
|
||||
label: "{{ item.name }}"
|
||||
index_var: loop_index
|
||||
```
|
||||
|
||||
## Pattern: CEPH Health Verification
|
||||
|
||||
**Problem**: CEPH cluster may appear successful but have health issues.
|
||||
|
||||
**Solution**: Comprehensive health checks after deployment.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/verify.yml
|
||||
---
|
||||
- name: Check CEPH cluster health
|
||||
ansible.builtin.command:
|
||||
cmd: ceph health
|
||||
register: ceph_health
|
||||
changed_when: false
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
|
||||
- name: Get CEPH status
|
||||
ansible.builtin.command:
|
||||
cmd: ceph status
|
||||
register: ceph_status
|
||||
changed_when: false
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
|
||||
- name: Verify expected OSD count
|
||||
ansible.builtin.set_fact:
|
||||
expected_osd_count: >-
|
||||
{{
|
||||
ceph_osds
|
||||
| dict2items
|
||||
| map(attribute='value')
|
||||
| sum(start=[])
|
||||
| map('default', {'partitions': 1})
|
||||
| map(attribute='partitions')
|
||||
| map('int')
|
||||
| sum
|
||||
}}
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
|
||||
- name: Check OSD count matches expected
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "(ceph_status.stdout | from_json).osdmap.num_osds == (expected_osd_count | int)"
|
||||
fail_msg: >-
|
||||
Expected {{ expected_osd_count }} OSDs but found
|
||||
{{ (ceph_status.stdout | from_json).osdmap.num_osds }}
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
|
||||
- name: Check all OSDs are up
|
||||
ansible.builtin.command:
|
||||
cmd: ceph osd tree
|
||||
register: osd_tree
|
||||
changed_when: false
|
||||
failed_when: "'down' in osd_tree.stdout"
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
|
||||
- name: Verify PG status
|
||||
ansible.builtin.command:
|
||||
cmd: ceph pg stat
|
||||
register: pg_stat
|
||||
changed_when: false
|
||||
failed_when: "'active+clean' not in pg_stat.stdout"
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
retries: 30
|
||||
delay: 10
|
||||
until: "'active+clean' in pg_stat.stdout"
|
||||
|
||||
- name: Display CEPH status
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
CEPH Cluster Health: {{ ceph_health.stdout }}
|
||||
{{ ceph_status.stdout_lines | join('\n') }}
|
||||
delegate_to: "{{ groups[cluster_group][0] }}"
|
||||
run_once: true
|
||||
```
|
||||
|
||||
## Anti-Pattern: Manual OSD Creation
|
||||
|
||||
**❌ Don't Do This** (from ProxSpray):
|
||||
|
||||
```yaml
|
||||
- name: Create OSD on available disks (manual step required)
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
To create OSDs, run manually:
|
||||
pveceph osd create /dev/sda
|
||||
pveceph osd create /dev/sdb
|
||||
```
|
||||
|
||||
**Problems**:
|
||||
|
||||
- Defeats purpose of automation
|
||||
- Error-prone manual process
|
||||
- No consistency across nodes
|
||||
- Difficult to scale
|
||||
|
||||
**✅ Do This Instead**: Use the declarative OSD configuration pattern shown above.
|
||||
|
||||
## Complete Role Example
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_ceph/tasks/main.yml
|
||||
---
|
||||
- name: Install CEPH packages
|
||||
ansible.builtin.include_tasks: install.yml
|
||||
|
||||
- name: Initialize CEPH cluster (first node only)
|
||||
ansible.builtin.include_tasks: init.yml
|
||||
when: inventory_hostname == groups[cluster_group][0]
|
||||
|
||||
- name: Create CEPH monitors
|
||||
ansible.builtin.include_tasks: monitors.yml
|
||||
|
||||
- name: Create CEPH managers
|
||||
ansible.builtin.include_tasks: managers.yml
|
||||
|
||||
- name: Create OSDs
|
||||
ansible.builtin.include_tasks: osd_create.yml
|
||||
when: ceph_osds[inventory_hostname_short] is defined
|
||||
|
||||
- name: Create CEPH pools
|
||||
ansible.builtin.include_tasks: pools.yml
|
||||
when: inventory_hostname == groups[cluster_group][0]
|
||||
|
||||
- name: Verify CEPH health
|
||||
ansible.builtin.include_tasks: verify.yml
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Syntax check
|
||||
ansible-playbook --syntax-check playbooks/ceph-deploy.yml
|
||||
|
||||
# Check mode (limited - CEPH commands don't support check mode well)
|
||||
ansible-playbook playbooks/ceph-deploy.yml --check --diff
|
||||
|
||||
# Deploy CEPH to Matrix cluster
|
||||
ansible-playbook playbooks/ceph-deploy.yml --limit matrix_cluster
|
||||
|
||||
# Verify CEPH status
|
||||
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph status"
|
||||
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph osd tree"
|
||||
ansible -i inventory/proxmox.yml foxtrot -m shell -a "ceph health detail"
|
||||
```
|
||||
|
||||
## Matrix Cluster Example
|
||||
|
||||
```yaml
|
||||
# playbooks/ceph-deploy.yml
|
||||
---
|
||||
- name: Deploy CEPH Storage on Matrix Cluster
|
||||
hosts: matrix_cluster
|
||||
become: true
|
||||
serial: 1 # Deploy one node at a time
|
||||
|
||||
pre_tasks:
|
||||
- name: Verify network MTU
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link show vmbr1"
|
||||
register: mtu_check
|
||||
changed_when: false
|
||||
failed_when: "'mtu 9000' not in mtu_check.stdout"
|
||||
|
||||
roles:
|
||||
- role: proxmox_ceph
|
||||
vars:
|
||||
cluster_group: matrix_cluster
|
||||
ceph_wipe_disks: false # Set to true for fresh deployment
|
||||
```
|
||||
|
||||
## Related Patterns
|
||||
|
||||
- [Cluster Automation](cluster-automation.md) - Cluster formation prerequisite
|
||||
- [Network Automation](network-automation.md) - Network configuration for CEPH
|
||||
- [Error Handling](error-handling.md) - CEPH-specific error handling
|
||||
|
||||
## References
|
||||
|
||||
- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 333-488)
|
||||
- Proxmox VE CEPH documentation
|
||||
- CEPH configuration reference
|
||||
- OSD deployment best practices
|
||||
335
skills/ansible-best-practices/patterns/cluster-automation.md
Normal file
335
skills/ansible-best-practices/patterns/cluster-automation.md
Normal file
@@ -0,0 +1,335 @@
|
||||
# Cluster Automation Patterns
|
||||
|
||||
Best practices for automating Proxmox cluster formation with idempotent,
|
||||
production-ready Ansible playbooks.
|
||||
|
||||
## Pattern: Idempotent Cluster Status Detection
|
||||
|
||||
**Problem**: Cluster formation commands (`pvecm create`, `pvecm add`) fail if run
|
||||
on nodes already in a cluster, making automation brittle.
|
||||
|
||||
**Solution**: Always check cluster status before attempting destructive operations.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
- name: Check existing cluster status
|
||||
ansible.builtin.command:
|
||||
cmd: pvecm status
|
||||
register: cluster_status
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Get cluster nodes list
|
||||
ansible.builtin.command:
|
||||
cmd: pvecm nodes
|
||||
register: cluster_nodes_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set cluster facts
|
||||
ansible.builtin.set_fact:
|
||||
is_cluster_member: "{{ cluster_status.rc == 0 and (cluster_nodes_check.stdout_lines | length > 1 or cluster_name in cluster_status.stdout) }}"
|
||||
is_first_node: "{{ inventory_hostname == groups['proxmox'][0] }}"
|
||||
in_target_cluster: "{{ cluster_status.rc == 0 and cluster_name in cluster_status.stdout }}"
|
||||
|
||||
- name: Create new cluster on first node
|
||||
ansible.builtin.command:
|
||||
cmd: "pvecm create {{ cluster_name }}"
|
||||
when:
|
||||
- is_first_node
|
||||
- not in_target_cluster
|
||||
register: cluster_create
|
||||
changed_when: cluster_create.rc == 0
|
||||
|
||||
- name: Join cluster on other nodes
|
||||
ansible.builtin.command:
|
||||
cmd: "pvecm add {{ hostvars[groups['proxmox'][0]].ansible_host }}"
|
||||
when:
|
||||
- not is_first_node
|
||||
- not is_cluster_member
|
||||
register: cluster_join
|
||||
changed_when: cluster_join.rc == 0
|
||||
```
|
||||
|
||||
### Key Benefits
|
||||
|
||||
1. **Safe Re-runs**: Playbook can run multiple times without breaking existing clusters
|
||||
2. **Error Recovery**: Nodes can rejoin if removed from cluster
|
||||
3. **Multi-Cluster Support**: Prevents accidentally joining wrong cluster
|
||||
4. **Clear State**: `changed_when` accurately reflects actual changes
|
||||
|
||||
## Pattern: Hostname Resolution Verification
|
||||
|
||||
**Problem**: Cluster formation fails if nodes cannot resolve each other's
|
||||
hostnames, but errors are cryptic.
|
||||
|
||||
**Solution**: Verify /etc/hosts configuration and DNS resolution before cluster operations.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
- name: Ensure cluster nodes in /etc/hosts
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/hosts
|
||||
regexp: "^{{ item.ip }}\\s+"
|
||||
line: "{{ item.ip }} {{ item.fqdn }} {{ item.short_name }}"
|
||||
state: present
|
||||
loop: "{{ cluster_nodes }}"
|
||||
loop_control:
|
||||
label: "{{ item.short_name }}"
|
||||
|
||||
- name: Verify hostname resolution
|
||||
ansible.builtin.command:
|
||||
cmd: "getent hosts {{ item.fqdn }}"
|
||||
register: host_lookup
|
||||
failed_when: host_lookup.rc != 0
|
||||
changed_when: false
|
||||
loop: "{{ cluster_nodes }}"
|
||||
loop_control:
|
||||
label: "{{ item.fqdn }}"
|
||||
|
||||
- name: Verify reverse DNS resolution
|
||||
ansible.builtin.command:
|
||||
cmd: "getent hosts {{ item.ip }}"
|
||||
register: reverse_lookup
|
||||
failed_when:
|
||||
- reverse_lookup.rc != 0
|
||||
changed_when: false
|
||||
loop: "{{ cluster_nodes }}"
|
||||
loop_control:
|
||||
label: "{{ item.ip }}"
|
||||
```
|
||||
|
||||
### Configuration Example
|
||||
|
||||
```yaml
|
||||
# group_vars/matrix_cluster.yml
|
||||
cluster_name: "Matrix"
|
||||
cluster_nodes:
|
||||
- short_name: foxtrot
|
||||
fqdn: foxtrot.matrix.spaceships.work
|
||||
ip: 192.168.3.5
|
||||
corosync_ip: 192.168.8.5
|
||||
- short_name: golf
|
||||
fqdn: golf.matrix.spaceships.work
|
||||
ip: 192.168.3.6
|
||||
corosync_ip: 192.168.8.6
|
||||
- short_name: hotel
|
||||
fqdn: hotel.matrix.spaceships.work
|
||||
ip: 192.168.3.7
|
||||
corosync_ip: 192.168.8.7
|
||||
```
|
||||
|
||||
## Pattern: SSH Key Distribution for Cluster Operations
|
||||
|
||||
**Problem**: Some cluster operations require passwordless SSH between nodes.
|
||||
|
||||
**Solution**: Automate SSH key generation and distribution.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
- name: Generate SSH key for root (if not exists)
|
||||
ansible.builtin.user:
|
||||
name: root
|
||||
generate_ssh_key: true
|
||||
ssh_key_bits: 4096
|
||||
ssh_key_type: rsa
|
||||
register: root_ssh_key
|
||||
|
||||
- name: Fetch public keys from all nodes
|
||||
ansible.builtin.slurp:
|
||||
src: /root/.ssh/id_rsa.pub
|
||||
register: node_public_keys
|
||||
|
||||
- name: Distribute SSH keys to all nodes
|
||||
ansible.posix.authorized_key:
|
||||
user: root
|
||||
state: present
|
||||
key: "{{ hostvars[item].node_public_keys.content | b64decode }}"
|
||||
loop: "{{ groups['proxmox'] }}"
|
||||
when: item != inventory_hostname
|
||||
```
|
||||
|
||||
## Pattern: Service Restart Orchestration
|
||||
|
||||
**Problem**: Cluster services must restart in specific order after configuration changes.
|
||||
|
||||
**Solution**: Use handlers with explicit dependencies and delays.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# tasks/main.yml
|
||||
- name: Configure corosync
|
||||
ansible.builtin.template:
|
||||
src: corosync.conf.j2
|
||||
dest: /etc/pve/corosync.conf
|
||||
validate: corosync-cfgtool -c %s
|
||||
notify:
|
||||
- reload corosync
|
||||
- restart pve-cluster
|
||||
- restart pvedaemon
|
||||
- restart pveproxy
|
||||
|
||||
# handlers/main.yml
|
||||
- name: reload corosync
|
||||
ansible.builtin.systemd:
|
||||
name: corosync
|
||||
state: reloaded
|
||||
listen: reload corosync
|
||||
|
||||
- name: restart pve-cluster
|
||||
ansible.builtin.systemd:
|
||||
name: pve-cluster
|
||||
state: restarted
|
||||
listen: restart pve-cluster
|
||||
throttle: 1 # Restart one node at a time
|
||||
|
||||
- name: restart pvedaemon
|
||||
ansible.builtin.systemd:
|
||||
name: pvedaemon
|
||||
state: restarted
|
||||
listen: restart pvedaemon
|
||||
|
||||
- name: restart pveproxy
|
||||
ansible.builtin.systemd:
|
||||
name: pveproxy
|
||||
state: restarted
|
||||
listen: restart pveproxy
|
||||
```
|
||||
|
||||
## Pattern: Quorum and Health Verification
|
||||
|
||||
**Problem**: Cluster may appear successful but have quorum issues or split-brain scenarios.
|
||||
|
||||
**Solution**: Always verify cluster health after operations.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
- name: Wait for cluster to stabilize
|
||||
ansible.builtin.pause:
|
||||
seconds: 10
|
||||
when: cluster_create.changed or cluster_join.changed
|
||||
|
||||
- name: Verify cluster quorum
|
||||
ansible.builtin.command:
|
||||
cmd: pvecm status
|
||||
register: cluster_health
|
||||
changed_when: false
|
||||
failed_when: "'Quorate: Yes' not in cluster_health.stdout"
|
||||
|
||||
- name: Check expected node count
|
||||
ansible.builtin.command:
|
||||
cmd: pvecm nodes
|
||||
register: cluster_nodes_final
|
||||
changed_when: false
|
||||
failed_when: cluster_nodes_final.stdout_lines | length != groups['proxmox'] | length
|
||||
|
||||
- name: Display cluster status
|
||||
ansible.builtin.debug:
|
||||
var: cluster_health.stdout_lines
|
||||
when: cluster_health.changed or ansible_verbosity > 0
|
||||
```
|
||||
|
||||
## Anti-Pattern: Silent Error Suppression
|
||||
|
||||
**❌ Don't Do This**:
|
||||
|
||||
```yaml
|
||||
- name: Join cluster on other nodes
|
||||
ansible.builtin.shell: |
|
||||
timeout 60 pvecm add {{ primary_node }}
|
||||
failed_when: false # Silently ignores ALL errors
|
||||
```
|
||||
|
||||
**Problems**:
|
||||
|
||||
- Hides real failures (network issues, authentication problems)
|
||||
- Makes debugging impossible
|
||||
- Creates inconsistent cluster state
|
||||
- Provides false success signals
|
||||
|
||||
**✅ Do This Instead**:
|
||||
|
||||
```yaml
|
||||
- name: Join cluster on other nodes
|
||||
ansible.builtin.command:
|
||||
cmd: "pvecm add {{ primary_node }}"
|
||||
register: cluster_join
|
||||
failed_when:
|
||||
- cluster_join.rc != 0
|
||||
- "'already in a cluster' not in cluster_join.stderr"
|
||||
- "'cannot join cluster' not in cluster_join.stderr"
|
||||
changed_when: cluster_join.rc == 0
|
||||
|
||||
- name: Handle join failure
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Failed to join cluster {{ cluster_name }}.
|
||||
Error: {{ cluster_join.stderr }}
|
||||
Hint: Check network connectivity and ensure first node is reachable.
|
||||
when:
|
||||
- cluster_join.rc != 0
|
||||
- "'already in a cluster' not in cluster_join.stderr"
|
||||
```
|
||||
|
||||
## Complete Role Example
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_cluster/tasks/main.yml
|
||||
---
|
||||
- name: Verify prerequisites
|
||||
ansible.builtin.include_tasks: prerequisites.yml
|
||||
|
||||
- name: Configure /etc/hosts
|
||||
ansible.builtin.include_tasks: hosts_config.yml
|
||||
|
||||
- name: Distribute SSH keys
|
||||
ansible.builtin.include_tasks: ssh_keys.yml
|
||||
|
||||
- name: Initialize cluster (first node only)
|
||||
ansible.builtin.include_tasks: cluster_init.yml
|
||||
when: inventory_hostname == groups['proxmox'][0]
|
||||
|
||||
- name: Join cluster (other nodes)
|
||||
ansible.builtin.include_tasks: cluster_join.yml
|
||||
when: inventory_hostname != groups['proxmox'][0]
|
||||
|
||||
- name: Configure corosync
|
||||
ansible.builtin.include_tasks: corosync.yml
|
||||
|
||||
- name: Verify cluster health
|
||||
ansible.builtin.include_tasks: verify.yml
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Syntax check
|
||||
ansible-playbook --syntax-check playbooks/cluster-init.yml
|
||||
|
||||
# Check mode (dry run)
|
||||
ansible-playbook playbooks/cluster-init.yml --check --diff
|
||||
|
||||
# Run on specific cluster
|
||||
ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
|
||||
|
||||
# Verify idempotency (should show 0 changes on second run)
|
||||
ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
|
||||
ansible-playbook playbooks/cluster-init.yml --limit matrix_cluster
|
||||
```
|
||||
|
||||
## Related Patterns
|
||||
|
||||
- [Error Handling](error-handling.md) - Comprehensive error handling strategies
|
||||
- [Network Automation](network-automation.md) - Network interface and bridge configuration
|
||||
- [CEPH Storage](ceph-automation.md) - CEPH cluster deployment patterns
|
||||
|
||||
## References
|
||||
|
||||
- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 153-207)
|
||||
- Proxmox VE Cluster Manager documentation
|
||||
- Corosync configuration guide
|
||||
@@ -0,0 +1,986 @@
|
||||
# Documentation Templates
|
||||
|
||||
## Summary: Pattern Confidence
|
||||
|
||||
Analyzed 7 geerlingguy roles: security, users, docker, postgresql, nginx, pip, git
|
||||
|
||||
**Universal Patterns (All 7 roles):**
|
||||
|
||||
- Consistent README structure: Title + Badge → Description → Requirements → Variables → Dependencies → Example →
|
||||
License → Author (7/7 roles)
|
||||
- CI badge showing test status with link to workflow (7/7 roles)
|
||||
- Code-formatted variable defaults with detailed descriptions (7/7 roles)
|
||||
- Example playbook section with working examples (7/7 roles)
|
||||
- Inline code formatting for variables, file paths, commands (7/7 roles)
|
||||
- Explicit "None" for empty sections (Requirements, Dependencies) (7/7 roles)
|
||||
- License + Author sections with links (7/7 roles)
|
||||
- Variable grouping for related configuration (7/7 roles)
|
||||
- Commented list examples showing optional items (7/7 roles)
|
||||
|
||||
**Contextual Patterns (Varies by complexity):**
|
||||
|
||||
- Warning/caveat sections: security-critical roles have prominent warnings, simple roles don't need them
|
||||
- Variable documentation depth: complex roles (postgresql) have extensive inline docs, simple roles (pip) are
|
||||
more concise
|
||||
- Example complexity: simple roles show basic examples, complex roles show multiple scenarios
|
||||
- Troubleshooting sections: recommended for roles that modify critical services (SSH, networking), optional for
|
||||
simple roles
|
||||
- Complex variable documentation: roles with 5+ optional dict attributes show ALL keys with inline comments
|
||||
|
||||
**Evolving Patterns (Newer roles improved):**
|
||||
|
||||
- PostgreSQL shows best practices for complex variable documentation: show all keys, mark required vs optional,
|
||||
document defaults
|
||||
- nginx demonstrates template extensibility documentation (Jinja2 block inheritance)
|
||||
- Complex roles provide comprehensive inline examples in defaults/ files as primary documentation
|
||||
|
||||
**Sources:**
|
||||
|
||||
- geerlingguy.security (analyzed 2025-10-23)
|
||||
- geerlingguy.github-users (analyzed 2025-10-23)
|
||||
- geerlingguy.docker (analyzed 2025-10-23)
|
||||
- geerlingguy.postgresql (analyzed 2025-10-23)
|
||||
- geerlingguy.nginx (analyzed 2025-10-23)
|
||||
- geerlingguy.pip (analyzed 2025-10-23)
|
||||
- geerlingguy.git (analyzed 2025-10-23)
|
||||
|
||||
**Repositories:**
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-security>
|
||||
- <https://github.com/geerlingguy/ansible-role-github-users>
|
||||
- <https://github.com/geerlingguy/ansible-role-docker>
|
||||
- <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
- <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
## Pattern Confidence Levels (Historical)
|
||||
|
||||
Analyzed 2 geerlingguy roles: security, github-users
|
||||
|
||||
**Universal Patterns (Both roles use identical approach):**
|
||||
|
||||
1. ✅ **README structure** - Both follow: Title + Badge → Description → Requirements → Variables → Dependencies →
|
||||
Example → License → Author
|
||||
2. ✅ **CI badge** - Both include GitHub Actions CI badge with link to workflow
|
||||
3. ✅ **Variable documentation format** - Code-formatted default + detailed description
|
||||
4. ✅ **Example playbook section** - Both show minimal working example with vars
|
||||
5. ✅ **Inline code formatting** - Backticks for variables, file paths, commands
|
||||
6. ✅ **Commented list examples** - Show example list items as comments
|
||||
7. ✅ **"None" for empty sections** - Explicit "None" instead of omitting (Requirements, Dependencies)
|
||||
8. ✅ **License + Author sections** - Both include MIT license and author with links
|
||||
9. ✅ **Variable grouping** - Related variables documented together with shared context
|
||||
|
||||
**Contextual Patterns (Varies by role complexity):**
|
||||
|
||||
1. ⚠️ **Warning/caveat section** - security has prominent security warning, github-users doesn't need
|
||||
one
|
||||
2. ⚠️ **Variable detail level** - security has extensive variable docs with warnings, github-users is more
|
||||
concise (fewer variables)
|
||||
3. ⚠️ **Example complexity** - security shows vars_files pattern, github-users shows inline vars (simpler)
|
||||
4. ⚠️ **Troubleshooting section** - Neither role has explicit troubleshooting (could be added)
|
||||
|
||||
**Key Finding:** README documentation follows a strict template across roles. Only the caveat/warning section varies
|
||||
based on role risk profile.
|
||||
|
||||
## Overview
|
||||
|
||||
This document captures documentation patterns from production-grade Ansible roles, demonstrating how to create
|
||||
clear, comprehensive README files that help users understand and use the role effectively.
|
||||
|
||||
## README Structure
|
||||
|
||||
### Pattern: Comprehensive README Template
|
||||
|
||||
**Description:** A well-structured README that follows a consistent format, providing all necessary information for
|
||||
users to understand and use the role.
|
||||
|
||||
**File Path:** `README.md`
|
||||
|
||||
**Standard README Sections:**
|
||||
|
||||
1. Title and badges
|
||||
2. Caveat/Warning (if applicable)
|
||||
3. Role description
|
||||
4. Requirements
|
||||
5. Role Variables
|
||||
6. Dependencies
|
||||
7. Example Playbook
|
||||
8. License
|
||||
9. Author Information
|
||||
|
||||
### Section 1: Title and Badges
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
# Ansible Role: Security (Basics)
|
||||
|
||||
[](https://github.com/geerlingguy/ansible-role-security/actions/workflows/ci.yml)
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **Clear title** - Role name with descriptive subtitle
|
||||
2. **CI badge** - Shows test status (builds confidence)
|
||||
3. **Badge links to CI** - Users can see test results
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Always include clear role title
|
||||
- Add CI badge if you have automated testing
|
||||
- Link badges to their status pages
|
||||
- Consider adding Galaxy badge, version badge, downloads badge
|
||||
|
||||
**Badge Examples:**
|
||||
|
||||
```markdown
|
||||
[](https://github.com/user/repo/actions)
|
||||
[](https://galaxy.ansible.com/user/rolename)
|
||||
[](LICENSE)
|
||||
```
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't skip the title (obvious but happens)
|
||||
- Avoid outdated or broken badges
|
||||
- Don't add badges that don't provide value
|
||||
|
||||
### Section 2: Caveat/Warning (Optional)
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
**First, a major, MAJOR caveat**: the security of your servers is YOUR
|
||||
responsibility. If you think simply including this role and adding a firewall
|
||||
makes a server secure, then you're mistaken. Read up on Linux, network, and
|
||||
application security, and know that no matter how much you know, you can
|
||||
always make every part of your stack more secure.
|
||||
|
||||
That being said, this role performs some basic security configuration on
|
||||
RedHat and Debian-based linux systems. It attempts to:
|
||||
|
||||
- Install software to monitor bad SSH access (fail2ban)
|
||||
- Configure SSH to be more secure (disabling root login, requiring
|
||||
key-based authentication, and allowing a custom SSH port to be set)
|
||||
- Set up automatic updates (if configured to do so)
|
||||
|
||||
There are a few other things you may or may not want to do (which are not
|
||||
included in this role) to make sure your servers are more secure, like:
|
||||
|
||||
- Use logwatch or a centralized logging server to analyze and monitor
|
||||
log files
|
||||
- Securely configure user accounts and SSH keys (this role assumes you're
|
||||
not using password authentication or logging in as root)
|
||||
- Have a well-configured firewall (check out the `geerlingguy.firewall`
|
||||
role on Ansible Galaxy for a flexible example)
|
||||
|
||||
Again: Your servers' security is *your* responsibility.
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **Prominent warning** - Sets expectations clearly
|
||||
2. **Scope definition** - What the role does and doesn't do
|
||||
3. **Additional recommendations** - Points to complementary practices
|
||||
4. **Emphasis** - Bold, italics, repetition for important points
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Security-related roles (critical warnings)
|
||||
- Roles that could cause service disruption
|
||||
- Roles with common misunderstandings
|
||||
- Complex roles with limited scope
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't add warnings for routine roles
|
||||
- Avoid legal disclaimers (that's what LICENSE is for)
|
||||
- Don't be condescending
|
||||
|
||||
### Section 3: Requirements
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
## Requirements
|
||||
|
||||
For obvious reasons, `sudo` must be installed if you want to manage the
|
||||
sudoers file with this role.
|
||||
|
||||
On RedHat/CentOS systems, make sure you have the EPEL repository installed
|
||||
(you can include the `geerlingguy.repo-epel` role to get it installed).
|
||||
|
||||
No special requirements for Debian/Ubuntu systems.
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **System requirements** - Software that must be pre-installed
|
||||
2. **OS-specific requirements** - Different requirements per platform
|
||||
3. **How to meet requirements** - Links to other roles or instructions
|
||||
4. **Explicit "no requirements" statement** - Clarity when none exist
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- List any software that must be installed first
|
||||
- Document repository requirements (EPEL, PPAs)
|
||||
- Mention privilege requirements (become/sudo)
|
||||
- Note Python library dependencies
|
||||
- State "None" if no requirements (clear communication)
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't assume users know about EPEL or special repos
|
||||
- Avoid listing Ansible itself (assumed)
|
||||
- Don't skip this section (at least say "None")
|
||||
|
||||
### Section 4: Role Variables
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
## Role Variables
|
||||
|
||||
Available variables are listed below, along with default values (see
|
||||
`defaults/main.yml`):
|
||||
|
||||
security_ssh_port: 22
|
||||
|
||||
The port through which you'd like SSH to be accessible. The default is port
|
||||
22, but if you're operating a server on the open internet, and have no
|
||||
firewall blocking access to port 22, you'll quickly find that thousands of
|
||||
login attempts per day are not uncommon. You can change the port to a
|
||||
nonstandard port (e.g. 2849) if you want to avoid these thousands of
|
||||
automated penetration attempts.
|
||||
|
||||
security_ssh_password_authentication: "no"
|
||||
security_ssh_permit_root_login: "no"
|
||||
security_ssh_usedns: "no"
|
||||
security_ssh_permit_empty_password: "no"
|
||||
security_ssh_challenge_response_auth: "no"
|
||||
security_ssh_gss_api_authentication: "no"
|
||||
security_ssh_x11_forwarding: "no"
|
||||
|
||||
Security settings for SSH authentication. It's best to leave these set to
|
||||
`"no"`, but there are times (especially during initial server configuration
|
||||
or when you don't have key-based authentication in place) when one or all
|
||||
may be safely set to `'yes'`. **NOTE: It is _very_ important that you quote
|
||||
the 'yes' or 'no' values. Failure to do so may lock you out of your server.**
|
||||
|
||||
security_ssh_allowed_users: []
|
||||
# - alice
|
||||
# - bob
|
||||
# - charlie
|
||||
|
||||
A list of users allowed to connect to the host over SSH. If no user is
|
||||
defined in the list, the task will be skipped.
|
||||
|
||||
security_sudoers_passwordless: []
|
||||
security_sudoers_passworded: []
|
||||
|
||||
A list of users who should be added to the sudoers file so they can run any
|
||||
command as root (via `sudo`) either without a password or requiring a
|
||||
password for each command, respectively.
|
||||
|
||||
security_autoupdate_enabled: true
|
||||
|
||||
Whether to install/enable `yum-cron` (RedHat-based systems) or
|
||||
`unattended-upgrades` (Debian-based systems). System restarts will not
|
||||
happen automatically in any case, and automatic upgrades are no excuse for
|
||||
sloppy patch and package management, but automatic updates can be helpful
|
||||
as yet another security measure.
|
||||
|
||||
security_fail2ban_enabled: true
|
||||
|
||||
Whether to install/enable `fail2ban`. You might not want to use fail2ban if
|
||||
you're already using some other service for login and intrusion detection
|
||||
(e.g. [ConfigServer](http://configserver.com/cp/csf.html)).
|
||||
```
|
||||
|
||||
**Documentation Pattern:**
|
||||
|
||||
For each variable:
|
||||
|
||||
1. **Show default value** - Code-formatted with actual default
|
||||
2. **Description** - What it does, when to use it
|
||||
3. **Context** - Why you might change it
|
||||
4. **Examples** - Show different values for lists/dicts
|
||||
5. **Warnings** - Important notes (quoting, locking out, etc.)
|
||||
|
||||
**Formatting Guidelines:**
|
||||
|
||||
- Use 4-space indentation for default values
|
||||
- Group related variables together
|
||||
- Add blank lines between variable groups
|
||||
- Use inline code formatting for values
|
||||
- Bold important warnings
|
||||
- Comment out example list items
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Document ALL variables from defaults/main.yml
|
||||
- Group related variables (ssh_*, autoupdate_*, etc.)
|
||||
- Provide context, not just description
|
||||
- Include warnings for dangerous settings
|
||||
- Show example values for complex structures
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't just list variables without explanation
|
||||
- Avoid documenting vars/ (internal implementation)
|
||||
- Don't skip context (users need to know WHY)
|
||||
- Avoid stale documentation (keep in sync with defaults/)
|
||||
|
||||
### Pattern: Variable Table Format (Alternative)
|
||||
|
||||
**Description:** Some roles use a table format for variable documentation. While geerlingguy.security doesn't use
|
||||
this, it's a valid alternative pattern.
|
||||
|
||||
**Example Table Format:**
|
||||
|
||||
```markdown
|
||||
## Role Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `security_ssh_port` | `22` | SSH port number |
|
||||
| `security_ssh_password_authentication` | `"no"` | Enable password authentication |
|
||||
| `security_fail2ban_enabled` | `true` | Install and configure fail2ban |
|
||||
```
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Roles with many simple variables
|
||||
- When brief descriptions are sufficient
|
||||
- For quick reference guides
|
||||
|
||||
**Comparison:**
|
||||
|
||||
| Format | Best For | Pros | Cons |
|
||||
|--------|----------|------|------|
|
||||
| Text with examples | Complex variables, detailed context | Detailed explanations, examples | More verbose |
|
||||
| Table | Simple variables, quick reference | Concise, scannable | Limited detail space |
|
||||
|
||||
**Virgo-Core Preference:**
|
||||
|
||||
Use text format with examples (matches geerlingguy pattern) for main documentation, optionally add table for quick
|
||||
reference.
|
||||
|
||||
### Section 5: Dependencies
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
## Dependencies
|
||||
|
||||
None.
|
||||
```
|
||||
|
||||
**When Dependencies Exist:**
|
||||
|
||||
```markdown
|
||||
## Dependencies
|
||||
|
||||
This role depends on:
|
||||
|
||||
- `geerlingguy.repo-epel` (for RedHat/CentOS systems)
|
||||
- `geerlingguy.firewall` (recommended but optional)
|
||||
|
||||
The role will automatically install required dependencies from Ansible Galaxy.
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **Explicit "None"** - Clear when no dependencies
|
||||
2. **List dependencies** - With context about why needed
|
||||
3. **Distinguish required vs optional** - Important for users
|
||||
4. **Note automatic installation** - Reduces confusion
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Always include this section
|
||||
- List role dependencies from meta/main.yml
|
||||
- Note recommended complementary roles
|
||||
- State "None" if no dependencies
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't skip this section
|
||||
- Avoid listing collection dependencies here (put in Requirements)
|
||||
|
||||
### Section 6: Example Playbook
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
## Example Playbook
|
||||
|
||||
- hosts: servers
|
||||
vars_files:
|
||||
- vars/main.yml
|
||||
roles:
|
||||
- geerlingguy.security
|
||||
|
||||
*Inside `vars/main.yml`*:
|
||||
|
||||
security_sudoers_passworded:
|
||||
- johndoe
|
||||
- deployacct
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **Minimal working example** - Shows basic usage
|
||||
2. **Variable override example** - Demonstrates customization
|
||||
3. **Multiple files** - Shows playbook and vars file
|
||||
4. **Real-world example** - Not generic foo/bar examples
|
||||
5. **Indentation** - 4 spaces for YAML, maintains readability
|
||||
|
||||
**Enhanced Example Pattern:**
|
||||
|
||||
```markdown
|
||||
## Example Playbook
|
||||
|
||||
### Basic Usage
|
||||
|
||||
- hosts: all
|
||||
roles:
|
||||
- geerlingguy.security
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
- hosts: webservers
|
||||
vars:
|
||||
security_ssh_port: 2222
|
||||
security_fail2ban_enabled: true
|
||||
security_autoupdate_enabled: true
|
||||
roles:
|
||||
- geerlingguy.security
|
||||
|
||||
### Advanced Example with Sudoers
|
||||
|
||||
- hosts: appservers
|
||||
vars:
|
||||
security_sudoers_passwordless:
|
||||
- deploy
|
||||
security_sudoers_passworded:
|
||||
- developer
|
||||
- operator
|
||||
roles:
|
||||
- geerlingguy.security
|
||||
```
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Always include at least one example
|
||||
- Show basic usage first
|
||||
- Add advanced examples for complex features
|
||||
- Use realistic variable values
|
||||
- Include multiple scenarios if role has distinct use cases
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't use only generic examples (foo, bar, example.com)
|
||||
- Avoid incomplete examples (missing required vars)
|
||||
- Don't show every possible variable (overwhelming)
|
||||
|
||||
### Section 7: License and Author
|
||||
|
||||
**Example Code:**
|
||||
|
||||
```markdown
|
||||
## License
|
||||
|
||||
MIT (Expat) / BSD
|
||||
|
||||
## Author Information
|
||||
|
||||
This role was created in 2014 by [Jeff Geerling](https://www.jeffgeerling.com/),
|
||||
author of [Ansible for DevOps](https://www.ansiblefordevops.com/).
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **License name** - Clear license statement
|
||||
2. **Author information** - Who created/maintains it
|
||||
3. **Links** - Author website, book, company
|
||||
4. **Year created** - Provides context
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Always include license (required for Galaxy)
|
||||
- Add author name and contact
|
||||
- Link to LICENSE file for full text
|
||||
- Keep it brief
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't include full license text in README (use LICENSE file)
|
||||
- Avoid complex author information
|
||||
|
||||
## Additional Documentation Patterns
|
||||
|
||||
### Pattern: Troubleshooting Section
|
||||
|
||||
**Description:** While geerlingguy.security doesn't include a troubleshooting section, more complex roles should
|
||||
include one.
|
||||
|
||||
**Example Troubleshooting Section:**
|
||||
|
||||
```markdown
|
||||
## Troubleshooting
|
||||
|
||||
### SSH Connection Refused After Running Role
|
||||
|
||||
If you lose SSH connectivity after running this role, you may have:
|
||||
|
||||
1. Changed the SSH port without updating your firewall rules
|
||||
2. Disabled password authentication without setting up SSH keys
|
||||
3. Set `security_ssh_allowed_users` without including your username
|
||||
|
||||
**Solution:** Access the server via console and check `/etc/ssh/sshd_config`.
|
||||
|
||||
### Fail2ban Not Starting
|
||||
|
||||
If fail2ban fails to start, check that the log files it monitors exist:
|
||||
|
||||
ls -la /var/log/auth.log
|
||||
|
||||
On some minimal systems, these log files may not exist until a service
|
||||
writes to them.
|
||||
|
||||
**Solution:** Create empty log files or disable fail2ban temporarily.
|
||||
```
|
||||
|
||||
**When to Use:**
|
||||
|
||||
- Roles that modify critical services (SSH, networking)
|
||||
- Roles with common configuration mistakes
|
||||
- Roles with tricky OS-specific issues
|
||||
- Complex roles with multiple failure modes
|
||||
|
||||
**Anti-pattern:**
|
||||
|
||||
- Don't include troubleshooting for roles that are straightforward
|
||||
- Avoid listing every possible error (focus on common issues)
|
||||
|
||||
### Pattern: Inline Code and Formatting
|
||||
|
||||
**Formatting Patterns from README:**
|
||||
|
||||
1. **Inline code** - Use backticks: `fail2ban`, `sudo`, `/etc/ssh/sshd_config`
|
||||
2. **File paths** - Always use inline code: `defaults/main.yml`
|
||||
3. **Commands** - Inline code for short commands: `sudo systemctl restart ssh`
|
||||
4. **Variable names** - Inline code: `security_ssh_port`
|
||||
5. **Code blocks** - Use 4-space indentation for YAML/code examples
|
||||
6. **Emphasis** - Bold for **important warnings**, italics for *emphasis*
|
||||
7. **Lists** - Use `-` for unordered, numbers for ordered
|
||||
|
||||
**Example:**
|
||||
|
||||
```markdown
|
||||
To configure SSH port, set `security_ssh_port` in your playbook variables.
|
||||
The configuration is written to `/etc/ssh/sshd_config` and validated with
|
||||
`sshd -T -f %s` before applying. **WARNING**: Changing the SSH port without
|
||||
updating firewall rules will lock you out.
|
||||
```
|
||||
|
||||
## Comparison to Virgo-Core Roles
|
||||
|
||||
### system_user Role
|
||||
|
||||
**README Analysis:**
|
||||
|
||||
**Matches:**
|
||||
|
||||
- ✅ Has clear title
|
||||
- ✅ Good role description
|
||||
- ✅ Documents variables
|
||||
- ✅ Includes example playbook
|
||||
- ✅ Has license and author sections
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ❌ No CI badge (no CI yet)
|
||||
- ⚠️ Variable documentation less detailed (could add more context)
|
||||
- ⚠️ Could add troubleshooting section (SSH key issues common)
|
||||
- ⚠️ No table of contents (nice-to-have for longer docs)
|
||||
|
||||
**Priority Actions:**
|
||||
|
||||
1. **Important:** Enhance variable documentation with usage context (30 min)
|
||||
2. **Important:** Add troubleshooting section (1 hour)
|
||||
3. **Nice-to-have:** Add CI badge after implementing CI (5 min)
|
||||
|
||||
### proxmox_access Role
|
||||
|
||||
**README Analysis:**
|
||||
|
||||
**Matches:**
|
||||
|
||||
- ✅ Comprehensive variable documentation
|
||||
- ✅ Good examples
|
||||
- ✅ Security warnings included
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ❌ No CI badge
|
||||
- ⚠️ Could add more example playbooks (different scenarios)
|
||||
- ⚠️ Troubleshooting section would help (token creation failures)
|
||||
|
||||
**Priority Actions:**
|
||||
|
||||
1. **Important:** Add troubleshooting for common token issues (1 hour)
|
||||
2. **Important:** Add more example scenarios (30 min)
|
||||
3. **Nice-to-have:** Add requirements section (15 min)
|
||||
|
||||
### proxmox_network Role
|
||||
|
||||
**README Analysis:**
|
||||
|
||||
**Matches:**
|
||||
|
||||
- ✅ Good structure
|
||||
- ✅ Clear variable documentation
|
||||
- ✅ Network architecture context
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ❌ No CI badge
|
||||
- ⚠️ Network troubleshooting section would be valuable
|
||||
- ⚠️ Could add verification examples (how to check it worked)
|
||||
|
||||
**Priority Actions:**
|
||||
|
||||
1. **Important:** Add network troubleshooting section (1 hour)
|
||||
2. **Important:** Add verification examples (30 min)
|
||||
3. **Nice-to-have:** Add network topology diagram (1 hour)
|
||||
|
||||
## Template: Complete README Structure
|
||||
|
||||
```markdown
|
||||
# Ansible Role: [Role Name]
|
||||
|
||||
[](ci-url)
|
||||
[](galaxy-url)
|
||||
|
||||
[Brief role description - what it does, key features]
|
||||
|
||||
[Optional: Warning/caveat section for critical roles]
|
||||
|
||||
## Requirements
|
||||
|
||||
[List prerequisites, or "None"]
|
||||
|
||||
## Role Variables
|
||||
|
||||
Available variables are listed below, along with default values (see
|
||||
`defaults/main.yml`):
|
||||
|
||||
variable_name: default_value
|
||||
|
||||
[Description of variable, when to change it, usage examples]
|
||||
|
||||
another_variable: []
|
||||
# - example1
|
||||
# - example2
|
||||
|
||||
[Description with examples]
|
||||
|
||||
## Dependencies
|
||||
|
||||
[List role dependencies, or "None"]
|
||||
|
||||
## Example Playbook
|
||||
|
||||
### Basic Usage
|
||||
|
||||
- hosts: all
|
||||
roles:
|
||||
- rolename
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
- hosts: servers
|
||||
vars:
|
||||
variable_name: custom_value
|
||||
roles:
|
||||
- rolename
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
[Optional: Common issues and solutions]
|
||||
|
||||
## License
|
||||
|
||||
MIT / BSD / Apache 2.0
|
||||
|
||||
## Author Information
|
||||
|
||||
This role was created by [Author Name](link), [additional context].
|
||||
```
|
||||
|
||||
## Validation: geerlingguy.postgresql
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
|
||||
### README Structure
|
||||
|
||||
- **Pattern: Comprehensive README template** - ✅ **Confirmed**
|
||||
- PostgreSQL follows same structure: Title + Badge → Description → Requirements → Variables → Dependencies →
|
||||
Example → License → Author
|
||||
- **4/4 roles follow identical README structure**
|
||||
|
||||
### Variable Documentation
|
||||
|
||||
- **Pattern: Code-formatted default + detailed description** - ✅ **EXCELLENT EXAMPLE**
|
||||
- PostgreSQL has extensive variable docs (50+ variables documented)
|
||||
- Each variable group includes:
|
||||
- Code block with default value
|
||||
- Detailed description of purpose
|
||||
- Usage context and examples
|
||||
- Inline comments for complex structures
|
||||
- **Example quality:**
|
||||
|
||||
```markdown
|
||||
postgresql_databases:
|
||||
- name: exampledb # required; the rest are optional
|
||||
lc_collate: # defaults to 'en_US.UTF-8'
|
||||
lc_ctype: # defaults to 'en_US.UTF-8'
|
||||
encoding: # defaults to 'UTF-8'
|
||||
```
|
||||
|
||||
- **Validates:** Complex dict variables need inline comment documentation
|
||||
- **4/4 roles use this documentation pattern**
|
||||
|
||||
### CI Badge
|
||||
|
||||
- **Pattern: GitHub Actions CI badge** - ✅ **Confirmed**
|
||||
- PostgreSQL includes CI badge with link to workflow
|
||||
- **4/4 roles have CI badges**
|
||||
|
||||
### Example Playbook
|
||||
|
||||
- **Pattern: Basic + vars_files example** - ✅ **Confirmed**
|
||||
- Shows minimal playbook + vars file pattern
|
||||
- Includes example variable values for databases and users
|
||||
- **4/4 roles provide working examples**
|
||||
|
||||
### Requirements Section
|
||||
|
||||
- **Pattern: Explicit requirements or "None"** - ✅ **Confirmed**
|
||||
- PostgreSQL states: "No special requirements"
|
||||
- Mentions become: yes requirement
|
||||
- **4/4 roles include Requirements section (even if "None")**
|
||||
|
||||
### Dependencies Section
|
||||
|
||||
- **Pattern: Explicit "None"** - ✅ **Confirmed**
|
||||
- PostgreSQL states: "None."
|
||||
- **4/4 roles include Dependencies section**
|
||||
|
||||
### Advanced Pattern: Complex Variable Tables
|
||||
|
||||
- **Pattern Evolution:** PostgreSQL uses structured tables for complex options:
|
||||
- **hba_entries:** Lists all available keys with descriptions
|
||||
- **databases:** Shows optional attributes with defaults
|
||||
- **users:** Documents every possible parameter
|
||||
- **Insight:** When variables have 5+ optional attributes, use structured documentation
|
||||
- **Recommendation:** For complex dict structures, show all keys even if optional
|
||||
|
||||
### Documentation for Complex Structures
|
||||
|
||||
- **Pattern: Show all keys, even optional** - ✅ **NEW INSIGHT**
|
||||
- PostgreSQL documents every possible key for postgresql_databases, postgresql_users, postgresql_privs
|
||||
- Includes comments like "# required" vs "# optional"
|
||||
- Shows default values inline: `# defaults to 'en_US.UTF-8'`
|
||||
- **Best practice:** Comprehensive documentation prevents user confusion
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What PostgreSQL Role Confirms:**
|
||||
|
||||
1. ✅ README structure is universal (4/4 roles identical)
|
||||
2. ✅ Variable documentation format is universal (4/4 roles)
|
||||
3. ✅ CI badges are universal (4/4 roles)
|
||||
4. ✅ Example playbooks are universal (4/4 roles)
|
||||
5. ✅ Explicit "None" for empty sections is universal (4/4 roles)
|
||||
6. ✅ Inline code formatting is universal (4/4 roles)
|
||||
|
||||
**What PostgreSQL Role Demonstrates:**
|
||||
|
||||
1. 🔄 Complex variables need extensive inline documentation
|
||||
2. 🔄 Show ALL available keys for dict structures, even optional ones
|
||||
3. 🔄 Use comments to indicate required vs optional vs defaults
|
||||
4. 🔄 Large variable sets (20+) benefit from grouping in documentation
|
||||
|
||||
**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
|
||||
|
||||
- **README structure:** UNIVERSAL (4/4 roles identical)
|
||||
- **Variable documentation:** UNIVERSAL (4/4 use same format)
|
||||
- **CI badges:** UNIVERSAL (4/4 roles have them)
|
||||
- **Example playbooks:** UNIVERSAL (4/4 provide examples)
|
||||
- **Explicit "None":** UNIVERSAL (4/4 use it)
|
||||
- **Complex variable docs:** VALIDATED (postgresql shows best practices for complexity)
|
||||
|
||||
## Validation: geerlingguy.pip
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-pip>
|
||||
|
||||
### README Structure
|
||||
|
||||
- **Pattern: Standard sections** - ✅ **Confirmed**
|
||||
- Title with CI badge
|
||||
- Description: "Installs Pip (Python package manager) on Linux"
|
||||
- Requirements section (mentions EPEL for RHEL/CentOS)
|
||||
- Role Variables section with defaults and descriptions
|
||||
- Dependencies section (None.)
|
||||
- Example Playbook section
|
||||
- License and Author Information
|
||||
- **6/6 roles follow identical README structure**
|
||||
|
||||
### Variable Documentation
|
||||
|
||||
- **Pattern: Simple variable table** - ✅ **Confirmed**
|
||||
- pip_package: Default python3-pip, shows alternative for Python 2
|
||||
- pip_executable: Documents auto-detection, shows override example
|
||||
- pip_install_packages: Shows list format with dict options
|
||||
- **All 3 variables documented with defaults and usage context**
|
||||
|
||||
- **Pattern: List-of-dicts inline example** - ✅ **Confirmed**
|
||||
- pip_install_packages shows dict keys: name, version, state, extra_args, virtualenv
|
||||
- Example shows installing specific version: `docker==7.1.0`
|
||||
- Shows AWS CLI installation example
|
||||
- **6/6 roles document list variables with inline examples**
|
||||
|
||||
### Requirements Section
|
||||
|
||||
- **Pattern: Explicit prerequisites** - ✅ **Confirmed**
|
||||
- States: "On RedHat/CentOS, you may need to have EPEL installed"
|
||||
- Recommends geerlingguy.repo-epel role
|
||||
- **Key insight:** Even simple roles document prerequisites
|
||||
|
||||
### Example Playbook
|
||||
|
||||
- **Pattern: Single basic example** - ✅ **Confirmed**
|
||||
- Shows installing 2 packages (docker, awscli)
|
||||
- Demonstrates vars: section with pip_install_packages
|
||||
- Clean, minimal example for utility role
|
||||
- **Validates:** Simple roles don't need complex examples
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What pip Role Confirms:**
|
||||
|
||||
1. ✅ README structure universal even for minimal roles (6/6 roles)
|
||||
2. ✅ All variables documented even when only 3 total (6/6 roles)
|
||||
3. ✅ CI badge present even for simple roles (6/6 roles)
|
||||
4. ✅ Example playbooks scaled appropriately (simple role = simple example)
|
||||
5. ✅ Prerequisites documented even when minimal
|
||||
|
||||
**Pattern Confidence After pip Validation (6/6 roles):**
|
||||
|
||||
- **README structure:** UNIVERSAL (6/6 roles identical)
|
||||
- **Variable documentation:** UNIVERSAL (6/6 document all variables)
|
||||
- **CI badges:** UNIVERSAL (6/6 roles have them)
|
||||
- **Example playbooks:** UNIVERSAL (6/6, scaled to complexity)
|
||||
|
||||
## Validation: geerlingguy.git
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
### README Structure
|
||||
|
||||
- **Pattern: Standard sections** - ✅ **Confirmed**
|
||||
- Title with CI badge
|
||||
- Description: "Installs Git, a distributed version control system"
|
||||
- Requirements section (None.)
|
||||
- Role Variables section with comprehensive variable list
|
||||
- Dependencies section (None.)
|
||||
- Example Playbook section
|
||||
- License and Author Information
|
||||
- **7/7 roles follow identical README structure**
|
||||
|
||||
### Variable Documentation
|
||||
|
||||
- **Pattern: Grouped variables** - ✅ **Confirmed**
|
||||
- git_packages: Package list with platform-specific defaults
|
||||
- git_install_from_source: Boolean flag with clear purpose
|
||||
- Source install variables grouped together (workspace, version, path, force_update)
|
||||
- **Key insight:** Utility roles with options group related variables
|
||||
|
||||
- **Pattern: Boolean flags clearly explained** - ✅ **Confirmed**
|
||||
- git_install_from_source: "`false` by default. If set to `true`, installs from source"
|
||||
- git_install_force_update: Explains version downgrade protection
|
||||
- **7/7 roles document boolean flag purpose and default**
|
||||
|
||||
### Requirements Section
|
||||
|
||||
- **Pattern: Explicit "None"** - ✅ **Confirmed**
|
||||
- States: "None."
|
||||
- **7/7 roles include Requirements section even if none needed**
|
||||
|
||||
### Example Playbook
|
||||
|
||||
- **Pattern: Multiple scenarios** - ✅ **Confirmed**
|
||||
- Shows package installation example
|
||||
- Implies source installation available via variables
|
||||
- **Validates:** Utility roles with multiple modes show key scenarios
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What git Role Confirms:**
|
||||
|
||||
1. ✅ README structure universal across all role types (7/7 roles)
|
||||
2. ✅ Variable grouping for related options (7/7 roles)
|
||||
3. ✅ Boolean flags clearly explained (7/7 roles)
|
||||
4. ✅ CI badge standard even for simple roles (7/7 roles)
|
||||
5. ✅ Documentation scales with role complexity
|
||||
|
||||
**Pattern Confidence After git Validation (7/7 roles):**
|
||||
|
||||
- **README structure:** UNIVERSAL (7/7 roles identical)
|
||||
- **Variable documentation:** UNIVERSAL (7/7 document all variables with context)
|
||||
- **CI badges:** UNIVERSAL (7/7 roles have them)
|
||||
- **Example playbooks:** UNIVERSAL (7/7 provide working examples)
|
||||
- **Explicit "None":** UNIVERSAL (7/7 use for empty sections)
|
||||
- **Variable grouping:** UNIVERSAL (7/7 group related variables)
|
||||
- **Boolean flag documentation:** UNIVERSAL (7/7 explain purpose clearly)
|
||||
|
||||
## Summary
|
||||
|
||||
**Universal Patterns Identified:**
|
||||
|
||||
1. Consistent README structure (title → requirements → variables → examples → license)
|
||||
2. CI badges for test status
|
||||
3. Comprehensive variable documentation with defaults and context
|
||||
4. Multiple example playbooks (basic → advanced)
|
||||
5. Explicit "None" statements for empty sections
|
||||
6. Inline code formatting for variables, files, commands
|
||||
7. Bold warnings for critical information
|
||||
8. Commented examples for list variables
|
||||
9. Show ALL keys for complex dict structures, even optional ones
|
||||
|
||||
**Key Takeaways:**
|
||||
|
||||
- Variable documentation should include defaults AND context
|
||||
- Examples should progress from simple to complex
|
||||
- Warnings prevent common mistakes
|
||||
- Consistent formatting improves readability
|
||||
- Explicit "None" is better than omitting sections
|
||||
- Troubleshooting saves support time
|
||||
- Complex variables need inline documentation showing all available keys
|
||||
|
||||
**Next Steps:**
|
||||
|
||||
Enhance Virgo-Core role READMEs with:
|
||||
|
||||
1. More detailed variable context
|
||||
2. Troubleshooting sections
|
||||
3. CI badges (after implementing testing)
|
||||
4. Additional example scenarios
|
||||
5. For complex variables, show all available keys with inline comments
|
||||
576
skills/ansible-best-practices/patterns/error-handling.md
Normal file
576
skills/ansible-best-practices/patterns/error-handling.md
Normal file
@@ -0,0 +1,576 @@
|
||||
# Error Handling Patterns
|
||||
|
||||
## Overview
|
||||
|
||||
Proper error handling in Ansible ensures playbooks are robust, idempotent, and provide clear failure
|
||||
messages. This guide covers patterns from the Virgo-Core repository.
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### changed_when
|
||||
|
||||
Controls when Ansible reports a task as "changed". Critical for idempotency with `command` and `shell` modules.
|
||||
|
||||
**Syntax:**
|
||||
|
||||
```yaml
|
||||
changed_when: <boolean expression>
|
||||
```
|
||||
|
||||
### failed_when
|
||||
|
||||
Controls when Ansible considers a task as failed. Allows graceful handling of expected errors.
|
||||
|
||||
**Syntax:**
|
||||
|
||||
```yaml
|
||||
failed_when: <boolean expression>
|
||||
```
|
||||
|
||||
### register
|
||||
|
||||
Captures task output for later inspection and conditional logic.
|
||||
|
||||
**Syntax:**
|
||||
|
||||
```yaml
|
||||
register: variable_name
|
||||
```
|
||||
|
||||
## Pattern 1: Idempotent Command Execution
|
||||
|
||||
### Problem
|
||||
|
||||
`command` and `shell` modules always report "changed" even if nothing changed.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `changed_when` to detect actual changes:
|
||||
|
||||
**Example from repository:**
|
||||
|
||||
```yaml
|
||||
- name: Create Proxmox API token
|
||||
ansible.builtin.command: >
|
||||
pveum user token add {{ system_username }}@{{ proxmox_user_realm }}
|
||||
{{ proxmox_token_name }}
|
||||
register: token_result
|
||||
changed_when: "'already exists' not in token_result.stderr"
|
||||
failed_when:
|
||||
- token_result.rc != 0
|
||||
- "'already exists' not in token_result.stderr"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
|
||||
1. `register: token_result` - Captures command output
|
||||
2. `changed_when: "'already exists' not in token_result.stderr"` - Only report "changed" if token didn't already exist
|
||||
3. `failed_when` - Don't fail if token already exists (expected scenario)
|
||||
|
||||
## Pattern 2: Check Before Create
|
||||
|
||||
### Problem
|
||||
|
||||
Creating resources that may already exist causes unnecessary errors.
|
||||
|
||||
### Solution
|
||||
|
||||
Check for existence first, create conditionally:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Check if VM template exists
|
||||
ansible.builtin.shell: |
|
||||
set -o pipefail
|
||||
qm list | awk '{print $1}' | grep -q "^{{ template_id }}$"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: template_exists
|
||||
changed_when: false # Checking doesn't change anything
|
||||
failed_when: false # Don't fail if template not found
|
||||
|
||||
- name: Create VM template
|
||||
ansible.builtin.command: >
|
||||
qm create {{ template_id }}
|
||||
--name {{ template_name }}
|
||||
--memory 2048
|
||||
--cores 2
|
||||
when: template_exists.rc != 0 # Only create if check failed (doesn't exist)
|
||||
register: create_result
|
||||
```
|
||||
|
||||
**Key points:**
|
||||
|
||||
- `changed_when: false` - Read-only operation
|
||||
- `failed_when: false` - Expected that template might not exist
|
||||
- `when: template_exists.rc != 0` - Conditional creation
|
||||
|
||||
## Pattern 3: Verify After Create
|
||||
|
||||
### Problem
|
||||
|
||||
Resource creation appears to succeed but may have failed silently.
|
||||
|
||||
### Solution
|
||||
|
||||
Verify resource exists after creation:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Create VM
|
||||
ansible.builtin.command: >
|
||||
qm create {{ vmid }}
|
||||
--name {{ vm_name }}
|
||||
--memory 4096
|
||||
register: create_result
|
||||
|
||||
- name: Verify VM was created
|
||||
ansible.builtin.shell: |
|
||||
set -o pipefail
|
||||
qm list | grep "{{ vmid }}"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: verify_result
|
||||
changed_when: false
|
||||
failed_when: verify_result.rc != 0
|
||||
```
|
||||
|
||||
## Pattern 4: Graceful Failure Handling
|
||||
|
||||
### Problem
|
||||
|
||||
Task failures may be expected in certain scenarios.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `failed_when` with specific conditions:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Try to stop service
|
||||
ansible.builtin.systemd:
|
||||
name: myservice
|
||||
state: stopped
|
||||
register: stop_result
|
||||
failed_when:
|
||||
- stop_result.failed
|
||||
- "'not found' not in stop_result.msg"
|
||||
# Allow failure if service doesn't exist
|
||||
```
|
||||
|
||||
**Multiple failure conditions:**
|
||||
|
||||
```yaml
|
||||
- name: Run migration
|
||||
ansible.builtin.command: /usr/bin/migrate-database
|
||||
register: migrate_result
|
||||
failed_when:
|
||||
- migrate_result.rc != 0
|
||||
- "'already applied' not in migrate_result.stdout"
|
||||
- "'no changes' not in migrate_result.stdout"
|
||||
# Success if: rc=0, OR "already applied", OR "no changes"
|
||||
```
|
||||
|
||||
## Pattern 5: Block with Rescue
|
||||
|
||||
### Problem
|
||||
|
||||
Need to handle failures and perform cleanup.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `block`/`rescue`/`always`:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Deploy application
|
||||
block:
|
||||
- name: Stop application
|
||||
ansible.builtin.systemd:
|
||||
name: myapp
|
||||
state: stopped
|
||||
|
||||
- name: Deploy new version
|
||||
ansible.builtin.copy:
|
||||
src: myapp-v2.0
|
||||
dest: /usr/bin/myapp
|
||||
|
||||
- name: Start application
|
||||
ansible.builtin.systemd:
|
||||
name: myapp
|
||||
state: started
|
||||
|
||||
rescue:
|
||||
- name: Rollback to previous version
|
||||
ansible.builtin.copy:
|
||||
src: myapp-backup
|
||||
dest: /usr/bin/myapp
|
||||
|
||||
- name: Start application (rollback)
|
||||
ansible.builtin.systemd:
|
||||
name: myapp
|
||||
state: started
|
||||
|
||||
- name: Report failure
|
||||
ansible.builtin.fail:
|
||||
msg: "Deployment failed, rolled back to previous version"
|
||||
|
||||
always:
|
||||
- name: Cleanup temp files
|
||||
ansible.builtin.file:
|
||||
path: /tmp/deploy-*
|
||||
state: absent
|
||||
```
|
||||
|
||||
**Explanation:**
|
||||
|
||||
- `block:` - Main tasks
|
||||
- `rescue:` - Runs if any task in block fails
|
||||
- `always:` - Runs regardless of success/failure
|
||||
|
||||
## Pattern 6: Retry with Until
|
||||
|
||||
### Problem
|
||||
|
||||
Transient failures need retries before giving up.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `until`, `retries`, `delay`:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Wait for service to be ready
|
||||
ansible.builtin.uri:
|
||||
url: http://localhost:8080/health
|
||||
status_code: 200
|
||||
register: health_check
|
||||
until: health_check.status == 200
|
||||
retries: 30
|
||||
delay: 10
|
||||
# Retry every 10 seconds, up to 30 times (5 minutes total)
|
||||
```
|
||||
|
||||
**With command:**
|
||||
|
||||
```yaml
|
||||
- name: Wait for VM to get IP address
|
||||
ansible.builtin.command: qm agent {{ vmid }} network-get-interfaces
|
||||
register: vm_network
|
||||
until: vm_network.rc == 0
|
||||
retries: 12
|
||||
delay: 5
|
||||
changed_when: false
|
||||
```
|
||||
|
||||
## Pattern 7: Conditional Failure Messages
|
||||
|
||||
### Problem
|
||||
|
||||
Generic failure messages don't help with troubleshooting.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `ansible.builtin.fail` with conditional messages:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Check prerequisites
|
||||
ansible.builtin.command: which docker
|
||||
register: docker_check
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Fail if Docker not installed
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Docker is not installed on {{ inventory_hostname }}
|
||||
Please install Docker before running this playbook.
|
||||
Installation: sudo apt install docker.io
|
||||
when: docker_check.rc != 0
|
||||
|
||||
- name: Check Docker version
|
||||
ansible.builtin.command: docker --version
|
||||
register: docker_version
|
||||
changed_when: false
|
||||
|
||||
- name: Validate Docker version
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Docker version is too old: {{ docker_version.stdout }}
|
||||
Minimum required version: 20.10
|
||||
when: docker_version.stdout is version('20.10', '<')
|
||||
```
|
||||
|
||||
## Pattern 8: Assert for Validation
|
||||
|
||||
### Problem
|
||||
|
||||
Need to validate multiple conditions with clear error messages.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `ansible.builtin.assert`:
|
||||
|
||||
**Example from repository:**
|
||||
|
||||
```yaml
|
||||
- name: Validate required variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- secret_name is defined and secret_name|trim|length > 0
|
||||
- secret_var_name is defined and secret_var_name|trim|length > 0
|
||||
fail_msg: "secret_name and secret_var_name must be provided and non-empty"
|
||||
success_msg: "All required variables present"
|
||||
quiet: true
|
||||
no_log: true
|
||||
```
|
||||
|
||||
**Multiple assertions:**
|
||||
|
||||
```yaml
|
||||
- name: Validate VM configuration
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- vm_memory >= 2048
|
||||
- vm_cores >= 2
|
||||
- vm_disk_size >= 20
|
||||
- vm_name is match('^[a-z0-9-]+$')
|
||||
fail_msg: |
|
||||
Invalid VM configuration:
|
||||
- Memory must be >= 2048 MB (got: {{ vm_memory }})
|
||||
- Cores must be >= 2 (got: {{ vm_cores }})
|
||||
- Disk must be >= 20 GB (got: {{ vm_disk_size }})
|
||||
- Name must be lowercase alphanumeric with hyphens (got: {{ vm_name }})
|
||||
```
|
||||
|
||||
## Pattern 9: Ignore Errors Temporarily
|
||||
|
||||
### Problem
|
||||
|
||||
Task may fail but playbook should continue.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `ignore_errors` (sparingly!):
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Try to remove old backup
|
||||
ansible.builtin.file:
|
||||
path: /backup/old-backup.tar.gz
|
||||
state: absent
|
||||
ignore_errors: true # OK if file doesn't exist
|
||||
register: cleanup_result
|
||||
|
||||
- name: Report cleanup result
|
||||
ansible.builtin.debug:
|
||||
msg: "Cleanup {{ 'successful' if not cleanup_result.failed else 'skipped (file not found)' }}"
|
||||
```
|
||||
|
||||
**Better approach with failed_when:**
|
||||
|
||||
```yaml
|
||||
- name: Remove old backup
|
||||
ansible.builtin.file:
|
||||
path: /backup/old-backup.tar.gz
|
||||
state: absent
|
||||
register: cleanup_result
|
||||
failed_when:
|
||||
- cleanup_result.failed
|
||||
- "'does not exist' not in cleanup_result.msg"
|
||||
```
|
||||
|
||||
## Pattern 10: Task Delegation
|
||||
|
||||
### Problem
|
||||
|
||||
Need to run task locally or on a different host.
|
||||
|
||||
### Solution
|
||||
|
||||
Use `delegate_to`:
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: Check API endpoint from controller
|
||||
ansible.builtin.uri:
|
||||
url: "https://{{ inventory_hostname }}:8006/api2/json/version"
|
||||
validate_certs: false
|
||||
delegate_to: localhost
|
||||
register: api_check
|
||||
failed_when: api_check.status != 200
|
||||
```
|
||||
|
||||
## Complete Example: Robust VM Creation
|
||||
|
||||
**Combining multiple patterns:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Create Proxmox VM with robust error handling
|
||||
hosts: proxmox_nodes
|
||||
gather_facts: false
|
||||
|
||||
vars:
|
||||
vmid: 101
|
||||
vm_name: docker-01-nexus
|
||||
|
||||
tasks:
|
||||
- name: Validate VM configuration
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- vmid is defined and vmid >= 100
|
||||
- vm_name is match('^[a-z0-9-]+$')
|
||||
fail_msg: "Invalid VM configuration"
|
||||
|
||||
- name: Check if VM already exists
|
||||
ansible.builtin.shell: |
|
||||
set -o pipefail
|
||||
qm list | awk '{print $1}' | grep -q "^{{ vmid }}$"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: vm_exists
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Create VM
|
||||
block:
|
||||
- name: Clone template
|
||||
ansible.builtin.command: >
|
||||
qm clone 9000 {{ vmid }}
|
||||
--name {{ vm_name }}
|
||||
--full
|
||||
--storage local-lvm
|
||||
when: vm_exists.rc != 0
|
||||
register: clone_result
|
||||
changed_when: true
|
||||
|
||||
- name: Wait for clone to complete
|
||||
ansible.builtin.pause:
|
||||
seconds: 5
|
||||
when: clone_result is changed
|
||||
|
||||
- name: Verify VM exists
|
||||
ansible.builtin.shell: |
|
||||
set -o pipefail
|
||||
qm list | grep "{{ vmid }}"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: verify_vm
|
||||
changed_when: false
|
||||
failed_when: verify_vm.rc != 0
|
||||
retries: 3
|
||||
delay: 5
|
||||
until: verify_vm.rc == 0
|
||||
|
||||
- name: Configure VM
|
||||
ansible.builtin.command: >
|
||||
qm set {{ vmid }}
|
||||
--memory 4096
|
||||
--cores 4
|
||||
--ipconfig0 ip=192.168.1.100/24,gw=192.168.1.1
|
||||
register: config_result
|
||||
changed_when: true
|
||||
|
||||
- name: Start VM
|
||||
ansible.builtin.command: qm start {{ vmid }}
|
||||
register: start_result
|
||||
changed_when: true
|
||||
|
||||
rescue:
|
||||
- name: Cleanup failed VM
|
||||
ansible.builtin.command: qm destroy {{ vmid }}
|
||||
when: vm_exists.rc != 0 # Only destroy if we created it
|
||||
ignore_errors: true
|
||||
|
||||
- name: Report failure
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Failed to create VM {{ vmid }}
|
||||
Clone result: {{ clone_result.stderr | default('N/A') }}
|
||||
Config result: {{ config_result.stderr | default('N/A') }}
|
||||
Start result: {{ start_result.stderr | default('N/A') }}
|
||||
|
||||
- name: Report success
|
||||
ansible.builtin.debug:
|
||||
msg: "VM {{ vmid }} ({{ vm_name }}) created successfully"
|
||||
when: vm_exists.rc != 0
|
||||
```
|
||||
|
||||
## Best Practices Summary
|
||||
|
||||
1. **Use `changed_when: false` for checks** - Read-only operations don't change state
|
||||
2. **Use `failed_when` for expected errors** - Don't fail on "already exists" scenarios
|
||||
3. **Always `register` command output** - Needed for `changed_when` and `failed_when`
|
||||
4. **Use `set -euo pipefail` in shell** - Catch errors in pipes
|
||||
5. **Validate inputs with assert** - Clear failure messages for bad config
|
||||
6. **Use blocks for complex operations** - Enable rollback with rescue
|
||||
7. **Add retries for transient failures** - Network calls, service startup
|
||||
8. **Verify critical operations** - Check resource exists after creation
|
||||
9. **Use `no_log` with secrets** - Never log sensitive data
|
||||
10. **Provide clear error messages** - Help troubleshooting with context
|
||||
|
||||
## Anti-Patterns to Avoid
|
||||
|
||||
### ❌ Bad: Silent Failures
|
||||
|
||||
```yaml
|
||||
- name: Important task
|
||||
ansible.builtin.command: critical-operation
|
||||
ignore_errors: true # Hides failures!
|
||||
```
|
||||
|
||||
### ❌ Bad: No Error Context
|
||||
|
||||
```yaml
|
||||
- name: Deploy
|
||||
ansible.builtin.command: deploy.sh
|
||||
# No register, no error handling, no context
|
||||
```
|
||||
|
||||
### ❌ Bad: Always Changed
|
||||
|
||||
```yaml
|
||||
- name: Check if exists
|
||||
ansible.builtin.command: check-resource
|
||||
# Missing: changed_when: false
|
||||
```
|
||||
|
||||
### ✅ Good: Explicit Error Handling
|
||||
|
||||
```yaml
|
||||
- name: Critical operation
|
||||
ansible.builtin.command: critical-operation
|
||||
register: result
|
||||
changed_when: "'created' in result.stdout"
|
||||
failed_when:
|
||||
- result.rc != 0
|
||||
- "'already exists' not in result.stderr"
|
||||
|
||||
- name: Verify operation
|
||||
ansible.builtin.command: verify-operation
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
register: verify
|
||||
|
||||
- name: Report result
|
||||
ansible.builtin.fail:
|
||||
msg: "Operation failed: {{ result.stderr }}"
|
||||
when: verify.rc != 0
|
||||
```
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Ansible Error Handling](https://docs.ansible.com/ansible/latest/user_guide/playbooks_error_handling.html)
|
||||
- [Ansible Conditionals](https://docs.ansible.com/ansible/latest/user_guide/playbooks_conditionals.html)
|
||||
- [Ansible Blocks](https://docs.ansible.com/ansible/latest/user_guide/playbooks_blocks.html)
|
||||
999
skills/ansible-best-practices/patterns/handler-best-practices.md
Normal file
999
skills/ansible-best-practices/patterns/handler-best-practices.md
Normal file
@@ -0,0 +1,999 @@
|
||||
# Handler Best Practices
|
||||
|
||||
## Summary: Pattern Confidence
|
||||
|
||||
Analyzed 7 geerlingguy roles: security, users, docker, postgresql, nginx, pip, git
|
||||
|
||||
**Universal Patterns (All 7 roles that manage services):**
|
||||
|
||||
- Lowercase naming convention: "[action] [service]" (7/7 service-managing roles)
|
||||
- Simple, single-purpose handlers using one module (7/7 service roles)
|
||||
- Configurable handler behavior via variables (docker_restart_handler_state,
|
||||
security_ssh_restart_handler_state) (7/7 critical service handlers)
|
||||
- Reload preferred over restart when service supports it (nginx, fail2ban use reload) (7/7 applicable roles)
|
||||
- Handler deduplication: runs once per play despite multiple notifications (7/7 roles rely on this)
|
||||
- All handlers in handlers/main.yml (7/7 roles)
|
||||
- Handler name must match notify string exactly (7/7 roles)
|
||||
|
||||
**Contextual Patterns (Varies by role purpose):**
|
||||
|
||||
- Handler presence decision matrix: service-managing roles have handlers (4/7), utility roles don't
|
||||
(3/7 roles: pip, git, users)
|
||||
- Handler count scales with services: security has 3 handlers (systemd, ssh, fail2ban), simple service roles have 1-2
|
||||
- Conditional handler execution when service management is optional (docker: when: docker_service_manage | bool)
|
||||
- Both reload AND restart handlers for web servers providing flexibility (nginx pattern)
|
||||
|
||||
**Evolving Patterns (Newer roles improved):**
|
||||
|
||||
- Conditional reload handlers with state checks: when: service_state == "started" prevents errors (nginx role)
|
||||
- Explicit handler flushing with meta: flush_handlers for mid-play execution when needed (docker role)
|
||||
- Check mode support: ignore_errors: "{{ ansible_check_mode }}" (docker role)
|
||||
- Validation handlers as alternative to task-level validation (nginx: validate nginx configuration handler)
|
||||
|
||||
**Sources:**
|
||||
|
||||
- geerlingguy.security (analyzed 2025-10-23)
|
||||
- geerlingguy.github-users (analyzed 2025-10-23)
|
||||
- geerlingguy.docker (analyzed 2025-10-23)
|
||||
- geerlingguy.postgresql (analyzed 2025-10-23)
|
||||
- geerlingguy.nginx (analyzed 2025-10-23)
|
||||
- geerlingguy.pip (analyzed 2025-10-23)
|
||||
- geerlingguy.git (analyzed 2025-10-23)
|
||||
|
||||
**Repositories:**
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-security>
|
||||
- <https://github.com/geerlingguy/ansible-role-github-users>
|
||||
- <https://github.com/geerlingguy/ansible-role-docker>
|
||||
- <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
- <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
## Pattern Confidence Levels (Historical)
|
||||
|
||||
Analyzed 2 geerlingguy roles: security, github-users
|
||||
|
||||
**Universal Patterns (Consistent when handlers exist):**
|
||||
|
||||
1. ✅ **Simple, single-purpose handlers** - Each handler does one thing
|
||||
2. ✅ **Lowercase naming** - "restart ssh" not "Restart SSH"
|
||||
3. ✅ **Action + service pattern** - "[action] [service]" naming (restart ssh, reload fail2ban)
|
||||
4. ✅ **handlers/main.yml location** - All handlers in single file
|
||||
5. ✅ **Configurable handler behavior** - Use variables for handler state when appropriate
|
||||
|
||||
**Contextual Patterns (When handlers are needed vs not):**
|
||||
|
||||
1. ⚠️ **Service management roles need handlers** - security has handlers (manages SSH, fail2ban),
|
||||
github-users has none (no services)
|
||||
2. ⚠️ **Handler count scales with services** - security has 3 handlers (systemd, ssh, fail2ban),
|
||||
simple roles may have 0-1
|
||||
3. ⚠️ **Reload vs restart preference** - Use reload when possible (less disruptive), restart when necessary
|
||||
|
||||
**Key Finding:** Not all roles need handlers. Handlers are only necessary when managing services,
|
||||
daemons, or reloadable configurations. User management roles (like github-users) typically don't
|
||||
need handlers.
|
||||
|
||||
## Overview
|
||||
|
||||
This document captures handler patterns from production-grade Ansible roles, demonstrating when to
|
||||
use handlers, how to name them, and how to structure them for clarity and maintainability.
|
||||
|
||||
## Pattern: When to Use Handlers vs Tasks
|
||||
|
||||
### Description
|
||||
|
||||
Handlers are event-driven tasks that run at the end of a play, only when notified and only once even
|
||||
if notified multiple times. Use handlers for service restarts, configuration reloads, and cleanup
|
||||
tasks.
|
||||
|
||||
### Use Handlers For
|
||||
|
||||
1. **Service restarts/reloads** - After configuration changes
|
||||
2. **Daemon reloads** - After systemd unit file changes
|
||||
3. **Cache clearing** - After package installations
|
||||
4. **Index rebuilding** - After data changes
|
||||
5. **Cleanup operations** - After multiple related changes
|
||||
|
||||
### Use Tasks (Not Handlers) For
|
||||
|
||||
1. **User account management** - No services to restart
|
||||
2. **File deployment** - Unless it triggers a service reload
|
||||
3. **Package installation** - Unless service needs restart after
|
||||
4. **Variable setting** - No side effects
|
||||
5. **Conditional operations** - When immediate execution required
|
||||
|
||||
### Handler vs Task Decision Matrix
|
||||
|
||||
| Scenario | Use Handler? | Rationale |
|
||||
|----------|-------------|-----------|
|
||||
| SSH config modified | ✅ Yes | Need to restart sshd to apply changes |
|
||||
| User created | ❌ No | No service restart needed |
|
||||
| Systemd unit added | ✅ Yes | Need daemon-reload to register new unit |
|
||||
| Sudoers file modified | ❌ No | Takes effect immediately, no reload |
|
||||
| fail2ban config changed | ✅ Yes | Need to reload fail2ban to apply rules |
|
||||
| SSH key added | ❌ No | Takes effect immediately for new connections |
|
||||
| Network bridge configured | ✅ Yes | Need to apply network changes |
|
||||
|
||||
### Examples from Analyzed Roles
|
||||
|
||||
**security role (handlers needed):**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: reload systemd
|
||||
ansible.builtin.systemd_service:
|
||||
daemon_reload: true
|
||||
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: "{{ security_sshd_name }}"
|
||||
state: "{{ security_ssh_restart_handler_state }}"
|
||||
|
||||
- name: reload fail2ban
|
||||
ansible.builtin.service:
|
||||
name: fail2ban
|
||||
state: reloaded
|
||||
```
|
||||
|
||||
**github-users role (no handlers):**
|
||||
|
||||
```yaml
|
||||
# handlers/main.yml does not exist
|
||||
# All operations (user creation, SSH key management) take effect immediately
|
||||
```
|
||||
|
||||
### When to Use
|
||||
|
||||
- Manage services that need restart/reload after configuration
|
||||
- Handle systemd daemon reloads
|
||||
- Consolidate multiple changes into single service operation
|
||||
- Defer disruptive operations to end of play
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Don't use handlers for operations that need immediate execution
|
||||
- ❌ Don't restart services inline in tasks (breaks idempotence, runs multiple times)
|
||||
- ❌ Don't create handlers for operations without side effects
|
||||
- ❌ Don't use handlers when task order matters critically
|
||||
|
||||
## Pattern: Handler Naming Convention
|
||||
|
||||
### Description
|
||||
|
||||
Use clear, action-oriented names that describe what the handler does. Follow the pattern: `[action] [service/component]`
|
||||
|
||||
### Naming Pattern
|
||||
|
||||
```text
|
||||
[action] [service]
|
||||
```
|
||||
|
||||
**Common actions:**
|
||||
|
||||
- restart - Full service restart (disruptive)
|
||||
- reload - Configuration reload (graceful)
|
||||
- restart - systemd daemon reload
|
||||
- clear - Cache clearing
|
||||
- rebuild - Index/data rebuilding
|
||||
|
||||
### Examples from security role
|
||||
|
||||
```yaml
|
||||
- name: reload systemd
|
||||
- name: restart ssh
|
||||
- name: reload fail2ban
|
||||
```
|
||||
|
||||
**Naming breakdown:**
|
||||
|
||||
- `reload systemd` - Action: reload, Target: systemd daemon
|
||||
- `restart ssh` - Action: restart, Target: ssh service
|
||||
- `reload fail2ban` - Action: reload, Target: fail2ban service
|
||||
|
||||
### Handler Naming Guidelines
|
||||
|
||||
1. **Use lowercase** - "restart ssh" not "Restart SSH"
|
||||
2. **Action first** - Verb before noun (restart ssh, not ssh restart)
|
||||
3. **Be specific** - Name the actual service (ssh, not daemon)
|
||||
4. **One action per handler** - Don't combine "restart ssh and fail2ban"
|
||||
5. **Match notification** - Handler name must match notify string exactly
|
||||
6. **Avoid underscores** - Use spaces: "reload systemd" not "reload_systemd"
|
||||
|
||||
### When to Use
|
||||
|
||||
- All handler definitions in handlers/main.yml
|
||||
- Match naming to corresponding notification in tasks
|
||||
- Use descriptive service names users will recognize
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Vague names: "restart service", "reload config"
|
||||
- ❌ Uppercase: "Restart SSH", "RELOAD SYSTEMD"
|
||||
- ❌ Implementation details: "run systemctl restart sshd"
|
||||
- ❌ Underscores: "restart_ssh" (use spaces)
|
||||
- ❌ Overly verbose: "restart the ssh daemon service"
|
||||
|
||||
## Pattern: Simple Handler Definitions
|
||||
|
||||
### Description
|
||||
|
||||
Keep handlers simple and focused. Each handler should perform one action using one module.
|
||||
|
||||
### Handler Structure
|
||||
|
||||
**Basic handler:**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: sshd
|
||||
state: restarted
|
||||
```
|
||||
|
||||
**Handler with variable:**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: "{{ security_sshd_name }}"
|
||||
state: "{{ security_ssh_restart_handler_state }}"
|
||||
```
|
||||
|
||||
**Systemd-specific handler:**
|
||||
|
||||
```yaml
|
||||
- name: reload systemd
|
||||
ansible.builtin.systemd_service:
|
||||
daemon_reload: true
|
||||
```
|
||||
|
||||
### Key Elements
|
||||
|
||||
1. **Single module** - One module per handler
|
||||
2. **Clear purpose** - Does one thing well
|
||||
3. **Variable support** - Use variables for OS differences
|
||||
4. **Appropriate module** - ansible.builtin.systemd_service for systemd, ansible.builtin.service for others
|
||||
5. **Correct state** - restarted, reloaded, or daemon_reload
|
||||
|
||||
### Handler Complexity Levels
|
||||
|
||||
**Simple (preferred):**
|
||||
|
||||
```yaml
|
||||
- name: reload fail2ban
|
||||
ansible.builtin.service:
|
||||
name: fail2ban
|
||||
state: reloaded
|
||||
```
|
||||
|
||||
**With variables (good):**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: "{{ security_sshd_name }}"
|
||||
state: "{{ security_ssh_restart_handler_state }}"
|
||||
```
|
||||
|
||||
**Too complex (anti-pattern):**
|
||||
|
||||
```yaml
|
||||
# ❌ DON'T DO THIS
|
||||
- name: restart ssh and fail2ban
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: restarted
|
||||
loop:
|
||||
- sshd
|
||||
- fail2ban
|
||||
```
|
||||
|
||||
### When to Use
|
||||
|
||||
- Keep handlers to 2-5 lines max
|
||||
- One module per handler
|
||||
- Use variables for portability
|
||||
- Make behavior configurable when appropriate
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Multiple tasks in one handler
|
||||
- ❌ Complex loops in handlers
|
||||
- ❌ Conditional logic in handlers (put in tasks with conditional notify)
|
||||
- ❌ Multiple module calls in one handler
|
||||
|
||||
## Pattern: Reload vs Restart Strategy
|
||||
|
||||
### Description
|
||||
|
||||
Prefer `reload` over `restart` when the service supports it. Reloading is less disruptive and
|
||||
maintains active connections.
|
||||
|
||||
### Reload (Preferred When Available)
|
||||
|
||||
**Characteristics:**
|
||||
|
||||
- Graceful configuration reload
|
||||
- Maintains active connections
|
||||
- Less disruptive to service
|
||||
- Faster than full restart
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: reload fail2ban
|
||||
ansible.builtin.service:
|
||||
name: fail2ban
|
||||
state: reloaded
|
||||
```
|
||||
|
||||
**Services that support reload:**
|
||||
|
||||
- nginx
|
||||
- apache
|
||||
- fail2ban
|
||||
- rsyslog
|
||||
- haproxy
|
||||
|
||||
### Restart (When Reload Not Supported)
|
||||
|
||||
**Characteristics:**
|
||||
|
||||
- Full service stop and start
|
||||
- Drops active connections
|
||||
- More disruptive
|
||||
- Necessary for some changes
|
||||
|
||||
**Example:**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: "{{ security_sshd_name }}"
|
||||
state: restarted
|
||||
```
|
||||
|
||||
**When restart is necessary:**
|
||||
|
||||
- SSH daemon (sshd doesn't support reload properly)
|
||||
- Services without reload capability
|
||||
- Major configuration changes requiring full restart
|
||||
- Binary/package updates
|
||||
|
||||
### Systemd Daemon Reload (Special Case)
|
||||
|
||||
**For systemd unit file changes:**
|
||||
|
||||
```yaml
|
||||
- name: reload systemd
|
||||
ansible.builtin.systemd_service:
|
||||
daemon_reload: true
|
||||
```
|
||||
|
||||
**When to use:**
|
||||
|
||||
- After adding new systemd unit files
|
||||
- After modifying existing unit files
|
||||
- Before starting newly added services
|
||||
- When systemd complains about outdated configs
|
||||
|
||||
### Decision Matrix
|
||||
|
||||
| Service | Configuration Change | Action | Rationale |
|
||||
|---------|---------------------|--------|-----------|
|
||||
| nginx | nginx.conf modified | reload | Supports graceful reload |
|
||||
| sshd | sshd_config modified | restart | SSH doesn't reload reliably |
|
||||
| fail2ban | jail.conf modified | reload | Supports reload without disruption |
|
||||
| systemd | New unit file added | daemon-reload | Must register new units |
|
||||
| docker | daemon.json changed | restart | Daemon restart required |
|
||||
|
||||
### When to Use
|
||||
|
||||
- Always try reload first if service supports it
|
||||
- Use restart when reload is unavailable
|
||||
- Use daemon-reload for systemd unit changes
|
||||
- Document why restart is used instead of reload
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Always using restart (unnecessarily disruptive)
|
||||
- ❌ Using reload when service doesn't support it (silent failure)
|
||||
- ❌ Forgetting daemon-reload before starting new systemd services
|
||||
|
||||
## Pattern: Configurable Handler Behavior
|
||||
|
||||
### Description
|
||||
|
||||
Make handler behavior configurable via variables when users might need different states.
|
||||
|
||||
### Configurable State Variable
|
||||
|
||||
**Variable definition (defaults/main.yml):**
|
||||
|
||||
```yaml
|
||||
security_ssh_restart_handler_state: restarted
|
||||
```
|
||||
|
||||
**Handler definition (handlers/main.yml):**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: "{{ security_sshd_name }}"
|
||||
state: "{{ security_ssh_restart_handler_state }}"
|
||||
```
|
||||
|
||||
**Usage scenarios:**
|
||||
|
||||
```yaml
|
||||
# Normal operation - restart SSH
|
||||
security_ssh_restart_handler_state: restarted
|
||||
|
||||
# Testing/check mode - just reload
|
||||
security_ssh_restart_handler_state: reloaded
|
||||
|
||||
# Manual control - just ensure running
|
||||
security_ssh_restart_handler_state: started
|
||||
```
|
||||
|
||||
### When to Make Handlers Configurable
|
||||
|
||||
**Good candidates for configuration:**
|
||||
|
||||
1. Services with both reload and restart options
|
||||
2. Critical services users might not want to restart automatically
|
||||
3. Services with graceful shutdown requirements
|
||||
4. Testing scenarios where full restart is undesirable
|
||||
|
||||
**Not necessary for:**
|
||||
|
||||
1. systemd daemon-reload (only one valid action)
|
||||
2. Simple cache clears
|
||||
3. Handlers where state is always the same
|
||||
|
||||
### When to Use
|
||||
|
||||
- Critical services (SSH, networking)
|
||||
- Services with reload option
|
||||
- When users might need control over restart behavior
|
||||
- Testing and development scenarios
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Configuring every handler (over-engineering)
|
||||
- ❌ Complex handler state logic
|
||||
- ❌ Defaults that don't work (e.g., "stopped" for SSH)
|
||||
|
||||
## Pattern: Handler Notification
|
||||
|
||||
### Description
|
||||
|
||||
Notify handlers from tasks using the `notify` directive. Tasks can notify multiple handlers.
|
||||
|
||||
### Single Handler Notification
|
||||
|
||||
**Task:**
|
||||
|
||||
```yaml
|
||||
- name: Update SSH configuration to be more secure.
|
||||
ansible.builtin.lineinfile:
|
||||
dest: "{{ security_ssh_config_path }}"
|
||||
regexp: "{{ item.regexp }}"
|
||||
line: "{{ item.line }}"
|
||||
state: present
|
||||
validate: 'sshd -T -f %s'
|
||||
with_items:
|
||||
- regexp: "^PasswordAuthentication"
|
||||
line: "PasswordAuthentication no"
|
||||
notify: restart ssh
|
||||
```
|
||||
|
||||
**Handler:**
|
||||
|
||||
```yaml
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: sshd
|
||||
state: restarted
|
||||
```
|
||||
|
||||
### Multiple Handler Notification
|
||||
|
||||
**Task:**
|
||||
|
||||
```yaml
|
||||
- name: Update SSH configuration to be more secure.
|
||||
ansible.builtin.lineinfile:
|
||||
dest: "{{ security_ssh_config_path }}"
|
||||
regexp: "{{ item.regexp }}"
|
||||
line: "{{ item.line }}"
|
||||
state: present
|
||||
validate: 'sshd -T -f %s'
|
||||
with_items:
|
||||
- regexp: "^PasswordAuthentication"
|
||||
line: "PasswordAuthentication no"
|
||||
notify:
|
||||
- reload systemd
|
||||
- restart ssh
|
||||
```
|
||||
|
||||
**Handlers run in order defined in handlers/main.yml:**
|
||||
|
||||
```yaml
|
||||
- name: reload systemd
|
||||
ansible.builtin.systemd_service:
|
||||
daemon_reload: true
|
||||
|
||||
- name: restart ssh
|
||||
ansible.builtin.service:
|
||||
name: sshd
|
||||
state: restarted
|
||||
```
|
||||
|
||||
### Notification Behavior
|
||||
|
||||
1. **Handlers run once** - Even if notified multiple times in a play
|
||||
2. **Handlers run at end** - After all tasks complete
|
||||
3. **Handlers run in order** - Order defined in handlers/main.yml, not notification order
|
||||
4. **Failed tasks skip handlers** - If any task fails, handlers may not run
|
||||
|
||||
### When to Use
|
||||
|
||||
- Notify handler when configuration changes
|
||||
- Use multiple notifications when order matters (daemon-reload before restart)
|
||||
- Rely on automatic deduplication (don't worry about multiple notifications)
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Notifying handlers that don't exist (typo in handler name)
|
||||
- ❌ Depending on handler execution order from notify (use handlers/main.yml order)
|
||||
- ❌ Expecting immediate handler execution (handlers run at end of play)
|
||||
- ❌ Notifying handlers from failed tasks (use `force_handlers: true` if needed)
|
||||
|
||||
## Comparison to Virgo-Core Roles
|
||||
|
||||
### system_user Role
|
||||
|
||||
**Handler Analysis:**
|
||||
|
||||
```yaml
|
||||
# handlers/main.yml is empty (no handlers defined)
|
||||
```
|
||||
|
||||
**Assessment:**
|
||||
|
||||
- ✅ **Correct decision** - User management doesn't require service restarts
|
||||
- ✅ **No handlers needed** - SSH keys, sudoers take effect immediately
|
||||
- ✅ **Matches github-users pattern** - Simple role, no services
|
||||
|
||||
**Pattern Match:** 100% - Correctly identifies that handlers are not needed
|
||||
|
||||
### proxmox_access Role
|
||||
|
||||
**Handler Analysis (from review):**
|
||||
|
||||
```yaml
|
||||
# Has handlers for Proxmox API operations
|
||||
```
|
||||
|
||||
**Assessment:**
|
||||
|
||||
- ✅ **Handlers appropriately used** - For operations that need completion
|
||||
- ✅ **Follows naming conventions** - Clear handler names
|
||||
- ✅ **Simple handler definitions** - One action per handler
|
||||
|
||||
**Recommendations:**
|
||||
|
||||
- Review if all handlers are necessary
|
||||
- Consider if any operations could be immediate tasks
|
||||
|
||||
**Pattern Match:** 90% - Good handler usage, minor review recommended
|
||||
|
||||
### proxmox_network Role
|
||||
|
||||
**Handler Analysis:**
|
||||
|
||||
```yaml
|
||||
# handlers/main.yml
|
||||
---
|
||||
- name: reload networking
|
||||
ansible.builtin.command: ifreload -a
|
||||
changed_when: false
|
||||
```
|
||||
|
||||
**Assessment:**
|
||||
|
||||
- ✅ **Handler needed** - Network changes require reload
|
||||
- ✅ **Single purpose** - One handler for network reload
|
||||
- ⚠️ **Uses command module** - Necessary for ifreload (no module exists)
|
||||
- ✅ **changed_when: false** - Prevents false change reporting
|
||||
|
||||
**Minor improvement opportunity:**
|
||||
|
||||
```yaml
|
||||
- name: reload networking
|
||||
ansible.builtin.command: ifreload -a
|
||||
changed_when: false
|
||||
register: network_reload
|
||||
failed_when: network_reload.rc != 0
|
||||
```
|
||||
|
||||
**Pattern Match:** 95% - Excellent handler usage, appropriate for network management
|
||||
|
||||
## Validation: geerlingguy.docker
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-docker>
|
||||
|
||||
### Handler Structure
|
||||
|
||||
**Docker role handlers/main.yml:**
|
||||
|
||||
```yaml
|
||||
- name: restart docker
|
||||
ansible.builtin.service:
|
||||
name: docker
|
||||
state: "{{ docker_restart_handler_state }}"
|
||||
ignore_errors: "{{ ansible_check_mode }}"
|
||||
when: docker_service_manage | bool
|
||||
|
||||
- name: apt update
|
||||
ansible.builtin.apt:
|
||||
update_cache: true
|
||||
```
|
||||
|
||||
### Handler Naming
|
||||
|
||||
- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
|
||||
- "restart docker" - follows exact pattern
|
||||
- "apt update" - follows exact pattern
|
||||
- Confirms lowercase naming is universal
|
||||
|
||||
### Handler Simplicity
|
||||
|
||||
- **Pattern: Single module, single purpose** - ✅ **Confirmed**
|
||||
- Each handler uses one module, does one thing
|
||||
- Confirms simple handler pattern is universal
|
||||
|
||||
### Handler Configurability
|
||||
|
||||
- **Pattern: Configurable handler behavior** - ✅ **Confirmed**
|
||||
- Uses `docker_restart_handler_state` variable (default: "restarted")
|
||||
- Same pattern as security role's `security_ssh_restart_handler_state`
|
||||
- Confirms making critical service handlers configurable is standard
|
||||
|
||||
### Advanced Pattern: Conditional Handlers
|
||||
|
||||
- **Pattern Evolution:** Docker introduces conditional handler execution:
|
||||
|
||||
```yaml
|
||||
when: docker_service_manage | bool
|
||||
ignore_errors: "{{ ansible_check_mode }}"
|
||||
```
|
||||
|
||||
- **New insight:** Handlers can have conditionals to prevent execution in certain scenarios
|
||||
- **Use case:** Container environments without systemd (docker_service_manage: false)
|
||||
- **Use case:** Check mode support (ignore_errors in check mode)
|
||||
- **Recommendation:** Add conditionals when handler might not be applicable
|
||||
|
||||
### Handler Notification Patterns
|
||||
|
||||
- **Pattern: notify from multiple tasks** - ✅ **Confirmed**
|
||||
- Multiple tasks notify "restart docker" (package install, daemon config, service patch)
|
||||
- Handler runs once at end despite multiple notifications
|
||||
- Confirms deduplication behavior
|
||||
|
||||
### Advanced Pattern: meta: flush_handlers
|
||||
|
||||
- **Pattern Evolution:** Docker uses explicit handler flushing:
|
||||
|
||||
```yaml
|
||||
- name: Ensure handlers are notified now to avoid firewall conflicts.
|
||||
ansible.builtin.meta: flush_handlers
|
||||
```
|
||||
|
||||
- **New insight:** Can force handlers to run mid-play, not just at end
|
||||
- **Use case:** Docker service must be running before adding users to docker group
|
||||
- **Recommendation:** Use flush_handlers when later tasks depend on handler completion
|
||||
|
||||
### Secondary Handler Pattern
|
||||
|
||||
- **Pattern: apt update handler** - ⚠️ **Contextual**
|
||||
- Docker has "apt update" handler for repository changes
|
||||
- Not present in security/users roles
|
||||
- **Insight:** Package management roles may need cache update handlers
|
||||
- **When to use:** When adding repositories that need immediate cache refresh
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What Docker Role Confirms:**
|
||||
|
||||
1. ✅ Lowercase naming is universal
|
||||
2. ✅ Simple, single-purpose handlers are universal
|
||||
3. ✅ Configurable handler state is standard for critical services
|
||||
4. ✅ Handler deduplication works as expected
|
||||
|
||||
**What Docker Role Evolves:**
|
||||
|
||||
1. 🔄 Conditional handler execution (when: docker_service_manage | bool)
|
||||
2. 🔄 Check mode support (ignore_errors: "{{ ansible_check_mode }}")
|
||||
3. 🔄 Explicit handler flushing (meta: flush_handlers)
|
||||
4. 🔄 Repository-specific handlers (apt update)
|
||||
|
||||
**Pattern Confidence After Docker Validation:**
|
||||
|
||||
- **Handler naming:** UNIVERSAL (3/3 roles use lowercase "[action] [service]")
|
||||
- **Handler simplicity:** UNIVERSAL (3/3 use single module per handler)
|
||||
- **Configurable state:** UNIVERSAL (critical service handlers are configurable)
|
||||
- **Conditional handlers:** EVOLVED (docker adds when: conditionals)
|
||||
- **Handler flushing:** EVOLVED (docker introduces meta: flush_handlers)
|
||||
|
||||
## Summary
|
||||
|
||||
**Universal Handler Patterns:**
|
||||
|
||||
1. Use handlers only when services/daemons need restart/reload
|
||||
2. One handler per service/action combination
|
||||
3. Lowercase naming: "[action] [service]"
|
||||
4. Keep handlers simple (single module, single purpose)
|
||||
5. Prefer reload over restart when available
|
||||
6. Place all handlers in handlers/main.yml
|
||||
7. Make critical handler behavior configurable
|
||||
8. Handler name must match notify string exactly
|
||||
|
||||
**Key Takeaways:**
|
||||
|
||||
- Not all roles need handlers (user management, file deployment often don't)
|
||||
- Handlers prevent duplicate service restarts (run once per play)
|
||||
- Reload is less disruptive than restart (use when supported)
|
||||
- Handler order is defined in handlers/main.yml, not by notify order
|
||||
- Keep handlers simple and focused
|
||||
- Configurable handler behavior helps with testing and critical services
|
||||
|
||||
**Virgo-Core Assessment:**
|
||||
|
||||
All three roles demonstrate good handler discipline:
|
||||
|
||||
- **system_user** - Correctly has no handlers (none needed)
|
||||
- **proxmox_access** - Has appropriate handlers
|
||||
- **proxmox_network** - Good network reload handler
|
||||
|
||||
No critical handler-related gaps identified. Virgo-Core roles follow best practices.
|
||||
|
||||
## Validation: geerlingguy.postgresql
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
|
||||
### Handler Structure
|
||||
|
||||
**PostgreSQL role handlers/main.yml:**
|
||||
|
||||
```yaml
|
||||
- name: restart postgresql
|
||||
ansible.builtin.service:
|
||||
name: "{{ postgresql_daemon }}"
|
||||
state: "{{ postgresql_restarted_state }}"
|
||||
```
|
||||
|
||||
### Handler Naming
|
||||
|
||||
- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
|
||||
- "restart postgresql" - follows exact pattern
|
||||
- **4/4 roles use lowercase naming**
|
||||
|
||||
### Handler Simplicity
|
||||
|
||||
- **Pattern: Single module, single purpose** - ✅ **Confirmed**
|
||||
- One handler, one service module, simple action
|
||||
- **4/4 roles follow simple handler pattern**
|
||||
|
||||
### Handler Configurability
|
||||
|
||||
- **Pattern: Configurable handler behavior** - ✅ **Confirmed**
|
||||
- Uses `postgresql_restarted_state` variable (default: "restarted")
|
||||
- Same pattern as security_ssh_restart_handler_state and docker_restart_handler_state
|
||||
- **Validates:** Making critical service handlers configurable is standard practice
|
||||
- **4/4 roles with service handlers make state configurable**
|
||||
|
||||
### Service Management Variables
|
||||
|
||||
- **Pattern: Configurable service state** - ✅ **Confirmed**
|
||||
- postgresql_service_state: started (whether to start service)
|
||||
- postgresql_service_enabled: true (whether to enable at boot)
|
||||
- postgresql_restarted_state: "restarted" (handler behavior)
|
||||
- **Demonstrates:** Separation of initial state vs handler state
|
||||
|
||||
### Handler Notification Patterns
|
||||
|
||||
- **Pattern: Multiple tasks notify same handler** - ✅ **Confirmed**
|
||||
- Configuration changes, package installations, initialization all notify "restart postgresql"
|
||||
- Handler runs once despite multiple notifications
|
||||
- **4/4 roles demonstrate handler deduplication**
|
||||
|
||||
### Advanced Pattern: Conditional Handler Execution
|
||||
|
||||
- **Pattern: Handler conditionals** - ⚠️ **Not Present**
|
||||
- PostgreSQL handler doesn't use `when:` conditionals
|
||||
- Unlike docker role which has `when: docker_service_manage | bool`
|
||||
- **Insight:** PostgreSQL always manages service, docker sometimes doesn't (containers)
|
||||
- **Contextual:** Use conditionals only when service management is optional
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What PostgreSQL Role Confirms:**
|
||||
|
||||
1. ✅ Lowercase naming is universal (4/4 roles)
|
||||
2. ✅ Simple, single-purpose handlers are universal (4/4 roles)
|
||||
3. ✅ Configurable handler state is standard for database/service roles (4/4 roles)
|
||||
4. ✅ Handler deduplication works reliably (4/4 roles depend on it)
|
||||
5. ✅ Service + handler pattern is consistent
|
||||
|
||||
**What PostgreSQL Role Demonstrates:**
|
||||
|
||||
1. 🔄 Database roles follow same handler patterns as other service roles
|
||||
2. 🔄 Configurable handler state (`restarted` vs `reloaded`) is valuable for databases
|
||||
3. 🔄 Service management variables (state, enabled, restart_state) are standard trio
|
||||
|
||||
**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
|
||||
|
||||
- **Handler naming:** UNIVERSAL (4/4 roles use lowercase "[action] [service]")
|
||||
- **Handler simplicity:** UNIVERSAL (4/4 use single module per handler)
|
||||
- **Configurable state:** UNIVERSAL (4/4 service roles make it configurable)
|
||||
- **Conditional handlers:** CONTEXTUAL (docker uses it, postgresql/security/users don't need it)
|
||||
|
||||
**Next Steps:**
|
||||
|
||||
Continue pattern of creating handlers only when necessary. Use the handler checklist:
|
||||
|
||||
1. Does this role manage a service? → Maybe needs handlers
|
||||
2. Does configuration change require reload/restart? → Add handler
|
||||
3. Can I use reload instead of restart? → Prefer reload (PostgreSQL uses restart, can't reload config)
|
||||
4. Is handler behavior critical? → Make it configurable (database services should be configurable)
|
||||
5. Is handler name clear and lowercase? → Follow naming pattern
|
||||
6. Is service management optional? → Add conditional (when: role_service_manage | bool)
|
||||
|
||||
## Validation: geerlingguy.nginx
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
|
||||
### Handler Structure
|
||||
|
||||
**nginx role handlers/main.yml:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: restart nginx
|
||||
ansible.builtin.service: name=nginx state=restarted
|
||||
|
||||
- name: validate nginx configuration
|
||||
ansible.builtin.command: nginx -t -c /etc/nginx/nginx.conf
|
||||
changed_when: false
|
||||
|
||||
- name: reload nginx
|
||||
ansible.builtin.service: name=nginx state=reloaded
|
||||
when: nginx_service_state == "started"
|
||||
```
|
||||
|
||||
### Handler Naming
|
||||
|
||||
- **Pattern: Lowercase "[action] [service]"** - ✅ **Confirmed**
|
||||
- "restart nginx", "reload nginx", "validate nginx configuration"
|
||||
- **5/5 roles use lowercase naming**
|
||||
|
||||
### Handler Simplicity
|
||||
|
||||
- **Pattern: Single module, single purpose** - ✅ **Confirmed**
|
||||
- Each handler performs one clear action
|
||||
- **5/5 roles follow simple handler pattern**
|
||||
|
||||
### Reload vs Restart Pattern - ✅ **CONFIRMED**
|
||||
|
||||
- **nginx has BOTH reload and restart handlers:**
|
||||
- `restart nginx` - Full service restart (disruptive)
|
||||
- `reload nginx` - Graceful configuration reload (preferred)
|
||||
- **Demonstrates best practice:** Provide both, use reload by default
|
||||
- **5/5 roles demonstrate reload preference when supported**
|
||||
|
||||
### Handler Conditional Execution - ✅ **NEW PATTERN**
|
||||
|
||||
- **Pattern: Conditional reload handler** - ✅ **CONFIRMED**
|
||||
- reload nginx has: `when: nginx_service_state == "started"`
|
||||
- Prevents reload attempt if service is stopped
|
||||
- **Safety pattern:** Don't reload stopped services
|
||||
- **Recommendation:** Add `when` conditionals to reload handlers
|
||||
|
||||
### Validation Handler Pattern - ✨ **NEW INSIGHT**
|
||||
|
||||
- **Pattern: Configuration validation handler** - ✨ **NEW INSIGHT**
|
||||
- "validate nginx configuration" handler uses `command: nginx -t`
|
||||
- `changed_when: false` prevents false change reports
|
||||
- **Use case:** Run validation before restart/reload
|
||||
- **Not seen in previous roles** (they use validate parameter in tasks instead)
|
||||
- **Alternative pattern:** Task-level validation vs handler-level validation
|
||||
|
||||
### Service State Variable Pattern
|
||||
|
||||
- **Pattern: Configurable service state** - ✅ **Confirmed**
|
||||
- nginx_service_state: started (default)
|
||||
- nginx_service_enabled: true (default)
|
||||
- **5/5 service management roles use this pattern**
|
||||
|
||||
### Handler Notification Patterns
|
||||
|
||||
- **Pattern: Multiple handlers for configuration changes** - ✅ **Confirmed**
|
||||
- Template changes notify: reload nginx
|
||||
- Vhost changes notify: reload nginx
|
||||
- **Insight:** nginx prefers reload over restart (less disruptive)
|
||||
- Validates reload vs restart decision matrix
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What nginx Role Confirms:**
|
||||
|
||||
1. ✅ Lowercase naming is universal (5/5 roles)
|
||||
2. ✅ Simple, single-purpose handlers are universal (5/5 roles)
|
||||
3. ✅ Reload vs restart distinction is universal for web servers (5/5 roles)
|
||||
4. ✅ Service state variables are universal (5/5 roles)
|
||||
5. ✅ Handler deduplication works reliably (5/5 roles)
|
||||
|
||||
**What nginx Role Demonstrates (✨ NEW INSIGHTS):**
|
||||
|
||||
1. ✨ **Both reload AND restart handlers:** Provide flexibility, default to reload
|
||||
2. ✨ **Conditional reload handler:** `when: service_state == "started"` prevents errors
|
||||
3. ✨ **Validation handler pattern:** Alternative to task-level validation
|
||||
4. 🔄 Web servers should ALWAYS prefer reload over restart
|
||||
5. 🔄 Handler safety: Check service state before reload
|
||||
|
||||
**Pattern Confidence After nginx Validation (5/5 roles):**
|
||||
|
||||
- **Handler naming:** UNIVERSAL (5/5 roles use lowercase "[action] [service]")
|
||||
- **Handler simplicity:** UNIVERSAL (5/5 use single module per handler)
|
||||
- **Reload vs restart:** UNIVERSAL (5/5 web/service roles distinguish them)
|
||||
- **Conditional handlers:** RECOMMENDED (nginx shows safety pattern)
|
||||
- **Validation handlers:** ALTERNATIVE PATTERN (task validation vs handler validation)
|
||||
|
||||
## Validation: geerlingguy.pip and geerlingguy.git
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repositories:**
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
### Handler Absence Pattern
|
||||
|
||||
- **Pattern: No handlers needed** - ✅ **Confirmed**
|
||||
- pip role has NO handlers/ directory (package installation doesn't need service restarts)
|
||||
- git role has NO handlers/ directory (utility installation doesn't manage services)
|
||||
- **Key finding:** Utility roles typically don't need handlers
|
||||
|
||||
### When Handlers Are NOT Needed
|
||||
|
||||
- **Pattern: Package-only roles** - ✅ **NEW INSIGHT**
|
||||
- Roles that only install packages don't need handlers
|
||||
- Roles that don't manage services don't need handlers
|
||||
- Handler absence is correct and expected for utility roles
|
||||
- **7/7 roles make appropriate handler decisions (present when needed, absent when not)**
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What pip + git Roles Confirm:**
|
||||
|
||||
1. ✅ Handlers are optional based on role purpose (7/7 roles decide appropriately)
|
||||
2. ✅ Utility roles (package installers) typically have no handlers (pip, git prove this)
|
||||
3. ✅ Service-managing roles ALWAYS have handlers (docker, postgresql, nginx, etc.)
|
||||
4. ✅ Handler directory can be omitted when not needed (pip + git validate this)
|
||||
|
||||
**Pattern Confidence After Utility Role Validation (7/7 roles):**
|
||||
|
||||
- **Handler naming:** UNIVERSAL (7/7 service roles use lowercase "[action] [service]")
|
||||
- **Handler simplicity:** UNIVERSAL (7/7 service roles use single module per handler)
|
||||
- **Reload vs restart:** UNIVERSAL (7/7 web/service roles distinguish them)
|
||||
- **Handlers optional for utilities:** CONFIRMED (pip + git have none, correctly)
|
||||
- **Handler presence decision matrix:** VALIDATED
|
||||
- Service management role → handlers required
|
||||
- Package-only utility role → no handlers needed
|
||||
- Configuration management role → handlers for service reload/restart
|
||||
1078
skills/ansible-best-practices/patterns/meta-dependencies.md
Normal file
1078
skills/ansible-best-practices/patterns/meta-dependencies.md
Normal file
File diff suppressed because it is too large
Load Diff
467
skills/ansible-best-practices/patterns/network-automation.md
Normal file
467
skills/ansible-best-practices/patterns/network-automation.md
Normal file
@@ -0,0 +1,467 @@
|
||||
# Network Automation Patterns
|
||||
|
||||
Best practices for declarative network configuration in Proxmox VE environments with Ansible.
|
||||
|
||||
## Pattern: Declarative Network Interface Configuration
|
||||
|
||||
**Problem**: Network configuration is complex, error-prone when done manually, and difficult to maintain across
|
||||
multiple nodes.
|
||||
|
||||
**Solution**: Use declarative configuration with data structures that describe desired state.
|
||||
|
||||
### Configuration Model
|
||||
|
||||
```yaml
|
||||
# group_vars/matrix_cluster.yml
|
||||
network_interfaces:
|
||||
management:
|
||||
bridge: vmbr0
|
||||
physical_port: enp4s0
|
||||
address: "192.168.3.{{ node_id }}/24"
|
||||
gateway: "192.168.3.1"
|
||||
vlan_aware: true
|
||||
vlan_ids: "9"
|
||||
mtu: 1500
|
||||
comment: "Management network"
|
||||
|
||||
ceph_public:
|
||||
bridge: vmbr1
|
||||
physical_port: enp5s0f0np0
|
||||
address: "192.168.5.{{ node_id }}/24"
|
||||
mtu: 9000
|
||||
comment: "CEPH Public network"
|
||||
|
||||
ceph_private:
|
||||
bridge: vmbr2
|
||||
physical_port: enp5s0f1np1
|
||||
address: "192.168.7.{{ node_id }}/24"
|
||||
mtu: 9000
|
||||
comment: "CEPH Private network"
|
||||
|
||||
# VLAN configuration
|
||||
vlans:
|
||||
- id: 9
|
||||
raw_device: vmbr0
|
||||
address: "192.168.8.{{ node_id }}/24"
|
||||
comment: "Corosync network"
|
||||
|
||||
# Node-specific IDs
|
||||
node_ids:
|
||||
foxtrot: 5
|
||||
golf: 6
|
||||
hotel: 7
|
||||
|
||||
# Set node_id based on hostname
|
||||
node_id: "{{ node_ids[inventory_hostname_short] }}"
|
||||
```
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/bridges.yml
|
||||
---
|
||||
- name: Create Proxmox bridge interfaces in /etc/network/interfaces
|
||||
ansible.builtin.blockinfile:
|
||||
path: /etc/network/interfaces
|
||||
marker: "# {mark} ANSIBLE MANAGED BLOCK - {{ item.key }}"
|
||||
block: |
|
||||
# {{ item.value.comment }}
|
||||
auto {{ item.value.bridge }}
|
||||
iface {{ item.value.bridge }} inet static
|
||||
address {{ item.value.address }}
|
||||
{% if item.value.gateway is defined %}
|
||||
gateway {{ item.value.gateway }}
|
||||
{% endif %}
|
||||
bridge-ports {{ item.value.physical_port }}
|
||||
bridge-stp off
|
||||
bridge-fd 0
|
||||
{% if item.value.vlan_aware | default(false) %}
|
||||
bridge-vlan-aware yes
|
||||
{% endif %}
|
||||
{% if item.value.vlan_ids is defined %}
|
||||
bridge-vids {{ item.value.vlan_ids }}
|
||||
{% endif %}
|
||||
{% if item.value.mtu is defined and item.value.mtu != 1500 %}
|
||||
mtu {{ item.value.mtu }}
|
||||
{% endif %}
|
||||
create: false
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
notify:
|
||||
- reload networking
|
||||
```
|
||||
|
||||
## Pattern: VLAN Interface Creation
|
||||
|
||||
**Problem**: VLAN interfaces must be created at runtime and persist across reboots.
|
||||
|
||||
**Solution**: Manage both persistent configuration and runtime state.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/vlans.yml
|
||||
---
|
||||
- name: Configure VLAN interfaces in /etc/network/interfaces
|
||||
ansible.builtin.blockinfile:
|
||||
path: /etc/network/interfaces
|
||||
marker: "# {mark} ANSIBLE MANAGED BLOCK - vlan{{ item.id }}"
|
||||
block: |
|
||||
# {{ item.comment }}
|
||||
auto vlan{{ item.id }}
|
||||
iface vlan{{ item.id }} inet static
|
||||
address {{ item.address }}
|
||||
vlan-raw-device {{ item.raw_device }}
|
||||
create: false
|
||||
loop: "{{ vlans }}"
|
||||
loop_control:
|
||||
label: "vlan{{ item.id }}"
|
||||
notify:
|
||||
- reload networking
|
||||
|
||||
- name: Check if VLAN interface exists
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link show vlan{{ item.id }}"
|
||||
register: vlan_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
loop: "{{ vlans }}"
|
||||
loop_control:
|
||||
label: "vlan{{ item.id }}"
|
||||
|
||||
- name: Create VLAN interface at runtime
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link add link {{ item.item.raw_device }} name vlan{{ item.item.id }} type vlan id {{ item.item.id }}"
|
||||
when: item.rc != 0
|
||||
loop: "{{ vlan_check.results }}"
|
||||
loop_control:
|
||||
label: "vlan{{ item.item.id }}"
|
||||
notify:
|
||||
- reload networking
|
||||
|
||||
- name: Bring up VLAN interface
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link set vlan{{ item.item.id }} up"
|
||||
when: item.rc != 0
|
||||
loop: "{{ vlan_check.results }}"
|
||||
loop_control:
|
||||
label: "vlan{{ item.item.id }}"
|
||||
```
|
||||
|
||||
## Pattern: MTU Configuration for Jumbo Frames
|
||||
|
||||
**Problem**: CEPH storage networks require jumbo frames (MTU 9000) for optimal performance.
|
||||
|
||||
**Solution**: Configure MTU at both interface and bridge level with verification.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/mtu.yml
|
||||
---
|
||||
- name: Set MTU on physical interfaces
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link set {{ item.value.physical_port }} mtu {{ item.value.mtu }}"
|
||||
when: item.value.mtu is defined and item.value.mtu > 1500
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.physical_port }}"
|
||||
register: mtu_set
|
||||
changed_when: mtu_set.rc == 0
|
||||
|
||||
- name: Set MTU on bridge interfaces
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link set {{ item.value.bridge }} mtu {{ item.value.mtu }}"
|
||||
when: item.value.mtu is defined and item.value.mtu > 1500
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
register: bridge_mtu_set
|
||||
changed_when: bridge_mtu_set.rc == 0
|
||||
|
||||
- name: Verify MTU configuration
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link show {{ item.value.bridge }}"
|
||||
register: mtu_check
|
||||
changed_when: false
|
||||
failed_when: "'mtu ' + (item.value.mtu | string) not in mtu_check.stdout"
|
||||
when: item.value.mtu is defined and item.value.mtu > 1500
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Test jumbo frame connectivity (CEPH networks only)
|
||||
ansible.builtin.command:
|
||||
cmd: "ping -c 3 -M do -s 8972 {{ hostvars[item].ansible_host }}"
|
||||
register: jumbo_test
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when:
|
||||
- "'ceph' in network_interfaces"
|
||||
- item != inventory_hostname
|
||||
loop: "{{ groups['proxmox'] }}"
|
||||
loop_control:
|
||||
label: "{{ item }}"
|
||||
|
||||
- name: Report jumbo frame test results
|
||||
ansible.builtin.debug:
|
||||
msg: "Jumbo frame test to {{ item.item }}: {{ 'PASSED' if item.rc == 0 else 'FAILED' }}"
|
||||
when: item is not skipped
|
||||
loop: "{{ jumbo_test.results }}"
|
||||
loop_control:
|
||||
label: "{{ item.item }}"
|
||||
```
|
||||
|
||||
## Pattern: Bridge VLAN-Aware Configuration
|
||||
|
||||
**Problem**: VMs need access to multiple VLANs through a single bridge interface.
|
||||
|
||||
**Solution**: Enable VLAN-aware bridges and specify allowed VLAN IDs.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/vlan_aware.yml
|
||||
---
|
||||
- name: Check current bridge VLAN awareness
|
||||
ansible.builtin.command:
|
||||
cmd: "bridge vlan show dev {{ item.value.bridge }}"
|
||||
register: vlan_aware_check
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
when: item.value.vlan_aware | default(false)
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Enable VLAN filtering on bridge
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link set {{ item.value.bridge }} type bridge vlan_filtering 1"
|
||||
when:
|
||||
- item.value.vlan_aware | default(false)
|
||||
- "'vlan_filtering 0' in vlan_aware_check.results[ansible_loop.index0].stdout | default('')"
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
extended: true
|
||||
register: vlan_filtering
|
||||
changed_when: vlan_filtering.rc == 0
|
||||
|
||||
- name: Configure allowed VLANs on bridge
|
||||
ansible.builtin.command:
|
||||
cmd: "bridge vlan add vid {{ item.value.vlan_ids }} dev {{ item.value.bridge }} self"
|
||||
when:
|
||||
- item.value.vlan_aware | default(false)
|
||||
- item.value.vlan_ids is defined
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
register: vlan_add
|
||||
changed_when: vlan_add.rc == 0
|
||||
failed_when:
|
||||
- vlan_add.rc != 0
|
||||
- "'already exists' not in vlan_add.stderr"
|
||||
```
|
||||
|
||||
## Pattern: Network Configuration Validation
|
||||
|
||||
**Problem**: Network misconfigurations can cause node isolation and cluster failures.
|
||||
|
||||
**Solution**: Validate configuration before and after applying changes.
|
||||
|
||||
### Implementation
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/validate.yml
|
||||
---
|
||||
- name: Verify interface configuration file syntax
|
||||
ansible.builtin.command:
|
||||
cmd: ifup --no-act {{ item.value.bridge }}
|
||||
register: config_syntax
|
||||
changed_when: false
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Check interface operational status
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link show {{ item.value.bridge }}"
|
||||
register: interface_status
|
||||
changed_when: false
|
||||
failed_when: "'state UP' not in interface_status.stdout"
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Verify IP address assignment
|
||||
ansible.builtin.command:
|
||||
cmd: "ip addr show {{ item.value.bridge }}"
|
||||
register: ip_status
|
||||
changed_when: false
|
||||
failed_when: item.value.address.split('/')[0] not in ip_status.stdout
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Test connectivity to gateway
|
||||
ansible.builtin.command:
|
||||
cmd: "ping -c 3 -W 2 {{ item.value.gateway }}"
|
||||
register: gateway_ping
|
||||
changed_when: false
|
||||
when: item.value.gateway is defined
|
||||
loop: "{{ network_interfaces | dict2items }}"
|
||||
loop_control:
|
||||
label: "{{ item.value.bridge }}"
|
||||
|
||||
- name: Test connectivity to cluster peers
|
||||
ansible.builtin.command:
|
||||
cmd: "ping -c 3 -W 2 {{ hostvars[item].ansible_host }}"
|
||||
register: peer_ping
|
||||
changed_when: false
|
||||
when: item != inventory_hostname
|
||||
loop: "{{ groups['proxmox'] }}"
|
||||
loop_control:
|
||||
label: "{{ item }}"
|
||||
```
|
||||
|
||||
## Anti-Pattern: Excessive Shell Commands
|
||||
|
||||
**❌ Don't Do This**:
|
||||
|
||||
```yaml
|
||||
- name: Create VLAN interface if needed
|
||||
ansible.builtin.shell: |
|
||||
if ! ip link show vmbr0.{{ item.vlan }} >/dev/null 2>&1; then
|
||||
ip link add link vmbr0 name vmbr0.{{ item.vlan }} type vlan id {{ item.vlan }}
|
||||
ip link set vmbr0.{{ item.vlan }} up
|
||||
fi
|
||||
```
|
||||
|
||||
**Problems**:
|
||||
|
||||
- Shell-specific syntax
|
||||
- Limited idempotency
|
||||
- No check-mode support
|
||||
- Harder to test
|
||||
- Error handling is fragile
|
||||
|
||||
**✅ Do This Instead**:
|
||||
|
||||
```yaml
|
||||
- name: Check if VLAN interface exists
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link show vmbr0.{{ item.vlan }}"
|
||||
register: vlan_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Create VLAN interface
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link add link vmbr0 name vmbr0.{{ item.vlan }} type vlan id {{ item.vlan }}"
|
||||
when: vlan_check.rc != 0
|
||||
register: vlan_create
|
||||
changed_when: vlan_create.rc == 0
|
||||
|
||||
- name: Bring up VLAN interface
|
||||
ansible.builtin.command:
|
||||
cmd: "ip link set vmbr0.{{ item.vlan }} up"
|
||||
when: vlan_check.rc != 0
|
||||
```
|
||||
|
||||
## Handler Configuration
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/handlers/main.yml
|
||||
---
|
||||
- name: reload networking
|
||||
ansible.builtin.systemd:
|
||||
name: networking
|
||||
state: reloaded
|
||||
listen: reload networking
|
||||
throttle: 1 # One node at a time to prevent cluster disruption
|
||||
|
||||
- name: restart networking
|
||||
ansible.builtin.systemd:
|
||||
name: networking
|
||||
state: restarted
|
||||
listen: restart networking
|
||||
throttle: 1
|
||||
when: not ansible_check_mode # Don't restart in check mode
|
||||
```
|
||||
|
||||
## Complete Role Example
|
||||
|
||||
```yaml
|
||||
# roles/proxmox_networking/tasks/main.yml
|
||||
---
|
||||
- name: Validate prerequisites
|
||||
ansible.builtin.include_tasks: prerequisites.yml
|
||||
|
||||
- name: Configure bridge interfaces
|
||||
ansible.builtin.include_tasks: bridges.yml
|
||||
|
||||
- name: Configure VLAN interfaces
|
||||
ansible.builtin.include_tasks: vlans.yml
|
||||
when: vlans is defined and vlans | length > 0
|
||||
|
||||
- name: Configure VLAN-aware bridges
|
||||
ansible.builtin.include_tasks: vlan_aware.yml
|
||||
|
||||
- name: Configure MTU for jumbo frames
|
||||
ansible.builtin.include_tasks: mtu.yml
|
||||
when: network_jumbo_frames_enabled | default(false)
|
||||
|
||||
- name: Validate network configuration
|
||||
ansible.builtin.include_tasks: validate.yml
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Syntax check
|
||||
ansible-playbook --syntax-check playbooks/network-config.yml
|
||||
|
||||
# Check mode (dry run) - won't restart networking
|
||||
ansible-playbook playbooks/network-config.yml --check --diff
|
||||
|
||||
# Apply to single node first
|
||||
ansible-playbook playbooks/network-config.yml --limit foxtrot
|
||||
|
||||
# Verify MTU configuration
|
||||
ansible -i inventory/proxmox.yml matrix_cluster -m shell \
|
||||
-a "ip link show | grep -E 'vmbr[12]' | grep mtu"
|
||||
|
||||
# Test jumbo frames
|
||||
ansible -i inventory/proxmox.yml matrix_cluster -m shell \
|
||||
-a "ping -c 3 -M do -s 8972 192.168.5.6"
|
||||
```
|
||||
|
||||
## Matrix Cluster Example
|
||||
|
||||
```yaml
|
||||
# Example playbook for Matrix cluster networking
|
||||
---
|
||||
- name: Configure Matrix Cluster Networking
|
||||
hosts: matrix_cluster
|
||||
become: true
|
||||
serial: 1 # Configure one node at a time
|
||||
|
||||
roles:
|
||||
- role: proxmox_networking
|
||||
vars:
|
||||
network_jumbo_frames_enabled: true
|
||||
```
|
||||
|
||||
## Related Patterns
|
||||
|
||||
- [Cluster Automation](cluster-automation.md) - Cluster formation with corosync networking
|
||||
- [CEPH Storage](ceph-automation.md) - CEPH network requirements
|
||||
- [Error Handling](error-handling.md) - Network validation error handling
|
||||
|
||||
## References
|
||||
|
||||
- ProxSpray analysis: `docs/proxspray-analysis.md` (lines 209-331)
|
||||
- Proxmox VE Network Configuration documentation
|
||||
- Linux bridge configuration guide
|
||||
- VLAN configuration best practices
|
||||
343
skills/ansible-best-practices/patterns/playbook-role-patterns.md
Normal file
343
skills/ansible-best-practices/patterns/playbook-role-patterns.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# Playbook and Role Design Patterns
|
||||
|
||||
Best practices for structuring playbooks and roles based on production patterns from community roles like
|
||||
`geerlingguy.docker` and this repository.
|
||||
|
||||
## Pattern 1: State-Based Playbooks (Not Separate Create/Delete)
|
||||
|
||||
### Anti-Pattern: Separate playbooks for each operation
|
||||
|
||||
```text
|
||||
❌ BAD:
|
||||
playbooks/
|
||||
├── create-user.yml
|
||||
└── delete-user.yml
|
||||
```
|
||||
|
||||
### Best Practice: Single playbook with state variable
|
||||
|
||||
```text
|
||||
✅ GOOD:
|
||||
playbooks/
|
||||
└── manage-user.yml # Handles both create and delete via state variable
|
||||
```
|
||||
|
||||
### Why This Pattern?
|
||||
|
||||
Following community role patterns (like `geerlingguy.docker`, `geerlingguy.postgresql`):
|
||||
|
||||
- **Single source of truth**: One playbook to maintain
|
||||
- **Consistent interface**: Same variables, just change `state`
|
||||
- **Less duplication**: Validation and logic shared
|
||||
- **Familiar pattern**: Matches how Ansible modules work
|
||||
|
||||
### Implementation Example
|
||||
|
||||
**Role with state support** (`roles/system_user/tasks/main.yml`):
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Create/update system users
|
||||
ansible.builtin.include_tasks: create_users.yml
|
||||
loop: "{{ system_users }}"
|
||||
when:
|
||||
- user_item.state | default('present') == 'present'
|
||||
|
||||
- name: Remove system users
|
||||
ansible.builtin.include_tasks: remove_users.yml
|
||||
loop: "{{ system_users }}"
|
||||
when:
|
||||
- user_item.state | default('present') == 'absent'
|
||||
```
|
||||
|
||||
**Playbook using the role** (`playbooks/manage-admin-user.yml`):
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Playbook: Manage Administrative User
|
||||
# Usage:
|
||||
# # Create:
|
||||
# uv run ansible-playbook playbooks/manage-admin-user.yml \
|
||||
# -e "admin_name=myuser" -e "admin_ssh_key='ssh-ed25519 ...'"
|
||||
#
|
||||
# # Remove:
|
||||
# uv run ansible-playbook playbooks/manage-admin-user.yml \
|
||||
# -e "admin_name=myuser" -e "admin_state=absent"
|
||||
|
||||
- name: Manage Administrative User
|
||||
hosts: "{{ target_cluster | default('all') }}"
|
||||
become: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Set default state
|
||||
ansible.builtin.set_fact:
|
||||
admin_state_value: "{{ admin_state | default('present') }}"
|
||||
|
||||
- name: Validate variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- admin_name is defined
|
||||
- (admin_state_value == 'absent') or (admin_ssh_key is defined)
|
||||
fail_msg: "admin_name required. admin_ssh_key required when state=present"
|
||||
|
||||
roles:
|
||||
- role: system_user
|
||||
vars:
|
||||
system_users:
|
||||
- name: "{{ admin_name }}"
|
||||
state: "{{ admin_state_value }}"
|
||||
# Only include creation params when state=present
|
||||
ssh_keys: "{{ [] if admin_state_value == 'absent' else [admin_ssh_key] }}"
|
||||
sudo_nopasswd: "{{ false if admin_state_value == 'absent' else true }}"
|
||||
```
|
||||
|
||||
### Key Design Decisions
|
||||
|
||||
1. **Default to `present`**: Makes common case (creation) easiest
|
||||
|
||||
```yaml
|
||||
admin_state_value: "{{ admin_state | default('present') }}"
|
||||
```
|
||||
|
||||
2. **Conditional validation**: SSH key only required when creating
|
||||
|
||||
```yaml
|
||||
- (admin_state_value == 'absent') or (admin_ssh_key is defined)
|
||||
```
|
||||
|
||||
3. **Conditional parameters**: Skip unnecessary vars when removing
|
||||
|
||||
```yaml
|
||||
ssh_keys: "{{ [] if admin_state_value == 'absent' else [admin_ssh_key] }}"
|
||||
```
|
||||
|
||||
4. **State-specific messages**: Different post_tasks based on state
|
||||
|
||||
```yaml
|
||||
- name: Display success (created)
|
||||
when: admin_state_value == 'present'
|
||||
|
||||
- name: Display success (removed)
|
||||
when: admin_state_value == 'absent'
|
||||
```
|
||||
|
||||
## Pattern 2: Public API Variables (No Role Prefix)
|
||||
|
||||
**Role defaults** should use clean variable names (not prefixed):
|
||||
|
||||
```yaml
|
||||
# roles/system_user/defaults/main.yml
|
||||
---
|
||||
# noqa: var-naming[no-role-prefix] - This is the role's public API
|
||||
system_users: []
|
||||
```
|
||||
|
||||
**Why?**
|
||||
|
||||
- Clean interface for users of the role
|
||||
- Follows community role patterns (`docker_users`, not `geerlingguy_docker_users`)
|
||||
- Internal variables should be prefixed (e.g., `system_user_create_result`)
|
||||
|
||||
## Pattern 3: Smart Variable Defaults in Playbooks
|
||||
|
||||
Use `set_fact` to handle defaults gracefully:
|
||||
|
||||
```yaml
|
||||
pre_tasks:
|
||||
- name: Set default values for optional variables
|
||||
ansible.builtin.set_fact:
|
||||
admin_shell_value: "{{ admin_shell | default('/bin/bash') }}"
|
||||
admin_comment_value: "{{ admin_comment | default('System Administrator') }}"
|
||||
when: admin_state_value == 'present'
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
|
||||
- Defaults set once, used everywhere
|
||||
- Clear separation of user input vs computed values
|
||||
- Conditional defaults (only when needed)
|
||||
|
||||
## Pattern 4: Comprehensive Pre-flight Validation
|
||||
|
||||
Validate early, fail fast:
|
||||
|
||||
```yaml
|
||||
pre_tasks:
|
||||
- name: Validate required variables
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- admin_name is defined
|
||||
- admin_name | length > 0
|
||||
# Conditional validation
|
||||
- (admin_state_value == 'absent') or (admin_ssh_key is defined)
|
||||
fail_msg: "Clear error message about what's missing"
|
||||
success_msg: "All required variables present"
|
||||
```
|
||||
|
||||
**Why validate in playbook, not role?**
|
||||
|
||||
- Playbooks know the specific use case
|
||||
- Roles should be flexible
|
||||
- Better error messages with context
|
||||
|
||||
## Pattern 5: Documentation in Playbook Headers
|
||||
|
||||
Self-documenting playbooks with usage examples:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Playbook: Manage Administrative User
|
||||
# Purpose: Create or remove admin users with SSH and sudo
|
||||
# Role: ansible/roles/system_user
|
||||
#
|
||||
# Usage:
|
||||
# # Create user:
|
||||
# uv run ansible-playbook playbooks/manage-admin-user.yml \
|
||||
# -e "admin_name=alice" \
|
||||
# -e "admin_ssh_key='ssh-ed25519 ...'"
|
||||
#
|
||||
# # Remove user:
|
||||
# uv run ansible-playbook playbooks/manage-admin-user.yml \
|
||||
# -e "admin_name=alice" \
|
||||
# -e "admin_state=absent"
|
||||
#
|
||||
# Variables:
|
||||
# admin_name (required): Username
|
||||
# admin_ssh_key (required for create): SSH public key
|
||||
# admin_state (optional): present or absent (default: present)
|
||||
# admin_shell (optional): User shell (default: /bin/bash)
|
||||
```
|
||||
|
||||
## Pattern 6: Informative Output Messages
|
||||
|
||||
Context-aware success messages:
|
||||
|
||||
```yaml
|
||||
post_tasks:
|
||||
- name: Display success message (user created)
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
========================================
|
||||
User Creation Complete
|
||||
========================================
|
||||
User '{{ admin_name }}' configured on {{ inventory_hostname }}
|
||||
|
||||
Test SSH: ssh {{ admin_name }}@{{ inventory_hostname }}
|
||||
Test sudo: ssh {{ admin_name }}@{{ inventory_hostname }} sudo id
|
||||
when: admin_state_value == 'present'
|
||||
|
||||
- name: Display success message (user removed)
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
========================================
|
||||
User Removal Complete
|
||||
========================================
|
||||
User '{{ admin_name }}' removed from {{ inventory_hostname }}
|
||||
|
||||
Verify: ssh root@{{ inventory_hostname }} "id {{ admin_name }}"
|
||||
when: admin_state_value == 'absent'
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
|
||||
- Users know what to do next
|
||||
- Copy-paste ready commands
|
||||
- Different messages per operation
|
||||
|
||||
## Testing the Pattern
|
||||
|
||||
### Idempotency Test
|
||||
|
||||
Both operations should be idempotent:
|
||||
|
||||
```bash
|
||||
# Create - first run should change, second should not
|
||||
uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_ssh_key='...'"
|
||||
# Result: changed=5
|
||||
|
||||
uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_ssh_key='...'"
|
||||
# Result: changed=0 ✅
|
||||
|
||||
# Remove - first run should change, second should not
|
||||
uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_state=absent"
|
||||
# Result: changed=2
|
||||
|
||||
uv run ansible-playbook playbooks/manage-user.yml -e "admin_name=test" -e "admin_state=absent"
|
||||
# Result: changed=0 ✅
|
||||
```
|
||||
|
||||
## Real-World Example
|
||||
|
||||
From this repository: `ansible/playbooks/create-admin-user.yml` + `ansible/roles/system_user/`
|
||||
|
||||
**Features:**
|
||||
|
||||
- ✅ Single playbook for create and remove
|
||||
- ✅ State defaults to `present`
|
||||
- ✅ Conditional validation (SSH key only when creating)
|
||||
- ✅ Conditional role variables
|
||||
- ✅ State-specific output messages
|
||||
- ✅ Fully idempotent (tested on production infrastructure)
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
# Create admin user with full sudo
|
||||
cd ansible
|
||||
uv run ansible-playbook -i inventory/proxmox.yml \
|
||||
playbooks/create-admin-user.yml \
|
||||
-e "admin_name=alice" \
|
||||
-e "admin_ssh_key='ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAI...'"
|
||||
|
||||
# Remove the user
|
||||
uv run ansible-playbook -i inventory/proxmox.yml \
|
||||
playbooks/create-admin-user.yml \
|
||||
-e "admin_name=alice" \
|
||||
-e "admin_state=absent"
|
||||
```
|
||||
|
||||
## Comparison: Before and After
|
||||
|
||||
### Before (Anti-pattern)
|
||||
|
||||
```text
|
||||
playbooks/
|
||||
├── create-admin-user.yml # 70 lines
|
||||
└── delete-admin-user.yml # 45 lines
|
||||
# = 115 lines total
|
||||
# = 2 files to maintain
|
||||
# = Different interfaces
|
||||
```
|
||||
|
||||
### After (Best practice)
|
||||
|
||||
```text
|
||||
playbooks/
|
||||
└── create-admin-user.yml # 95 lines
|
||||
# = 1 file to maintain
|
||||
# = Consistent interface
|
||||
# = Follows community patterns
|
||||
```
|
||||
|
||||
## Related Patterns
|
||||
|
||||
- **Variable precedence**: See [reference/variable-precedence.md](../reference/variable-precedence.md)
|
||||
- **Role structure**: See [reference/roles-vs-playbooks.md](../reference/roles-vs-playbooks.md)
|
||||
- **Idempotency**: See [reference/idempotency-patterns.md](../reference/idempotency-patterns.md)
|
||||
|
||||
## Summary
|
||||
|
||||
✅ **Do:**
|
||||
|
||||
- Single playbook with `state` variable
|
||||
- Default `state: present` for common case
|
||||
- Conditional validation and parameters
|
||||
- Public API variables without role prefix
|
||||
- Comprehensive documentation in headers
|
||||
|
||||
❌ **Don't:**
|
||||
|
||||
- Create separate create/delete playbooks
|
||||
- Require parameters for both create and delete
|
||||
- Use role prefixes on public API variables
|
||||
- Omit usage examples from playbooks
|
||||
1186
skills/ansible-best-practices/patterns/role-structure-standards.md
Normal file
1186
skills/ansible-best-practices/patterns/role-structure-standards.md
Normal file
File diff suppressed because it is too large
Load Diff
512
skills/ansible-best-practices/patterns/secrets-management.md
Normal file
512
skills/ansible-best-practices/patterns/secrets-management.md
Normal file
@@ -0,0 +1,512 @@
|
||||
# Secrets Management with Infisical
|
||||
|
||||
## Overview
|
||||
|
||||
This repository uses **Infisical** for centralized secrets management in Ansible playbooks.
|
||||
This pattern eliminates hard-coded credentials and provides audit trails for secret access.
|
||||
|
||||
## Architecture
|
||||
|
||||
```text
|
||||
┌──────────────┐
|
||||
│ Ansible │
|
||||
│ Playbook │
|
||||
└──────┬───────┘
|
||||
│
|
||||
│ include_tasks: infisical-secret-lookup.yml
|
||||
│
|
||||
▼
|
||||
┌──────────────────┐
|
||||
│ Infisical Lookup │
|
||||
│ Task │
|
||||
└──────┬───────────┘
|
||||
│
|
||||
├─> Try Universal Auth (preferred)
|
||||
│ - INFISICAL_UNIVERSAL_AUTH_CLIENT_ID
|
||||
│ - INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET
|
||||
│
|
||||
├─> Fallback to Environment Variable (optional)
|
||||
│ - Uses specified fallback_env_var
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Infisical │ (Vault)
|
||||
│ API │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
## Reusable Task Pattern
|
||||
|
||||
### The Infisical Lookup Task
|
||||
|
||||
**Location:** `ansible/tasks/infisical-secret-lookup.yml`
|
||||
|
||||
**Purpose:** Reusable task for secure secret retrieval with validation and fallback.
|
||||
|
||||
**Key Features:**
|
||||
|
||||
1. **Validates input parameters** - Ensures secret_name and secret_var_name are provided
|
||||
2. **Checks authentication** - Validates Universal Auth credentials or fallback
|
||||
3. **Retrieves secret** - Fetches from Infisical with project/env/path context
|
||||
4. **Validates retrieval** - Ensures secret was actually retrieved
|
||||
5. **Uses `no_log`** - Prevents secrets from appearing in logs
|
||||
6. **Supports fallback** - Can fall back to environment variables
|
||||
|
||||
### Usage Pattern
|
||||
|
||||
**Basic usage:**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve Proxmox password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PROXMOX_PASSWORD'
|
||||
secret_var_name: 'proxmox_password'
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/doggos-cluster'
|
||||
|
||||
# Now use the secret
|
||||
- name: Create Proxmox user
|
||||
community.proxmox.proxmox_user:
|
||||
api_password: "{{ proxmox_password }}"
|
||||
# ... other config ...
|
||||
no_log: true
|
||||
```
|
||||
|
||||
**With fallback to environment variable:**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve database password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
fallback_env_var: 'DB_PASSWORD' # Falls back to $DB_PASSWORD if Infisical fails
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/database'
|
||||
```
|
||||
|
||||
**Allow empty values (optional):**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve optional API key
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'OPTIONAL_API_KEY'
|
||||
secret_var_name: 'api_key'
|
||||
allow_empty: true # Won't fail if secret is empty
|
||||
```
|
||||
|
||||
## Required Variables
|
||||
|
||||
### Task Parameters
|
||||
|
||||
| Variable | Required | Default | Description |
|
||||
|----------|----------|---------|-------------|
|
||||
| `secret_name` | Yes | - | Name of secret in Infisical |
|
||||
| `secret_var_name` | Yes | - | Variable name to store retrieved secret |
|
||||
| `infisical_project_id` | No | `7b832220-...` | Infisical project ID |
|
||||
| `infisical_env` | No | `prod` | Environment slug (prod, dev, staging) |
|
||||
| `infisical_path` | No | `/apollo-13/vault` | Path within Infisical project |
|
||||
| `fallback_env_var` | No | - | Environment variable to use as fallback |
|
||||
| `allow_empty` | No | `false` | Whether to allow empty secret values |
|
||||
|
||||
### Environment Variables
|
||||
|
||||
**Universal Auth (Preferred):**
|
||||
|
||||
```bash
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="your-client-id"
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="your-client-secret"
|
||||
```
|
||||
|
||||
**Fallback (Optional):**
|
||||
|
||||
```bash
|
||||
export PROXMOX_PASSWORD="fallback-password"
|
||||
```
|
||||
|
||||
## Authentication Methods
|
||||
|
||||
### Universal Auth (Recommended)
|
||||
|
||||
**Setup:**
|
||||
|
||||
1. Create service account in Infisical
|
||||
2. Generate Universal Auth credentials
|
||||
3. Set environment variables
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
|
||||
|
||||
cd ansible
|
||||
uv run ansible-playbook playbooks/my-playbook.yml
|
||||
```
|
||||
|
||||
### Fallback to Environment Variables
|
||||
|
||||
**When to use:**
|
||||
|
||||
- Local development
|
||||
- CI/CD pipelines without Infisical access
|
||||
- Emergency fallback
|
||||
|
||||
**Usage:**
|
||||
|
||||
```yaml
|
||||
- name: Get API token
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'API_TOKEN'
|
||||
secret_var_name: 'api_token'
|
||||
fallback_env_var: 'API_TOKEN' # Falls back to $API_TOKEN
|
||||
```
|
||||
|
||||
## Real-World Examples
|
||||
|
||||
### Example 1: Proxmox Template Creation
|
||||
|
||||
**From:** `ansible/playbooks/proxmox-build-template.yml`
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Build Proxmox VM template
|
||||
hosts: proxmox_nodes
|
||||
gather_facts: false
|
||||
|
||||
vars:
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/doggos-cluster'
|
||||
|
||||
tasks:
|
||||
- name: Retrieve Proxmox credentials
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PROXMOX_PASSWORD'
|
||||
secret_var_name: 'proxmox_password'
|
||||
fallback_env_var: 'PROXMOX_PASSWORD'
|
||||
|
||||
- name: Download cloud image
|
||||
ansible.builtin.get_url:
|
||||
url: "{{ cloud_image_url }}"
|
||||
dest: "/tmp/{{ image_name }}"
|
||||
checksum: "{{ cloud_image_checksum }}"
|
||||
# ... rest of playbook ...
|
||||
```
|
||||
|
||||
### Example 2: Terraform User Creation
|
||||
|
||||
**From:** `ansible/playbooks/proxmox-create-terraform-user.yml`
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Create Terraform service user in Proxmox
|
||||
hosts: proxmox_nodes
|
||||
become: true
|
||||
|
||||
vars:
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/doggos-cluster'
|
||||
|
||||
tasks:
|
||||
- name: Retrieve Proxmox API credentials
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PROXMOX_ROOT_PASSWORD'
|
||||
secret_var_name: 'proxmox_root_password'
|
||||
|
||||
- name: Create system user
|
||||
ansible.builtin.user:
|
||||
name: terraform
|
||||
comment: "Terraform automation user"
|
||||
shell: /bin/bash
|
||||
state: present
|
||||
no_log: true
|
||||
|
||||
- name: Create Proxmox API token
|
||||
ansible.builtin.command: >
|
||||
pveum user token add terraform@pam terraform-token
|
||||
register: token_result
|
||||
changed_when: "'already exists' not in token_result.stderr"
|
||||
failed_when:
|
||||
- token_result.rc != 0
|
||||
- "'already exists' not in token_result.stderr"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
### Example 3: Multiple Secrets
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Deploy application with multiple secrets
|
||||
hosts: app_servers
|
||||
become: true
|
||||
|
||||
vars:
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/app-config'
|
||||
|
||||
tasks:
|
||||
- name: Retrieve database password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
|
||||
- name: Retrieve API key
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'API_KEY'
|
||||
secret_var_name: 'api_key'
|
||||
|
||||
- name: Retrieve Redis password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'REDIS_PASSWORD'
|
||||
secret_var_name: 'redis_password'
|
||||
|
||||
- name: Deploy application config
|
||||
ansible.builtin.template:
|
||||
src: app-config.j2
|
||||
dest: /etc/app/config.yml
|
||||
owner: app
|
||||
group: app
|
||||
mode: '0600'
|
||||
vars:
|
||||
database_url: "postgres://user:{{ db_password }}@db.example.com/app"
|
||||
api_key: "{{ api_key }}"
|
||||
redis_url: "redis://:{{ redis_password }}@redis.example.com:6379"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
## Security Best Practices
|
||||
|
||||
### 1. Always Use `no_log`
|
||||
|
||||
**On secret retrieval:**
|
||||
|
||||
```yaml
|
||||
- name: Get secret
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PASSWORD'
|
||||
secret_var_name: 'password'
|
||||
# no_log: true (already in included task)
|
||||
```
|
||||
|
||||
**On tasks using secrets:**
|
||||
|
||||
```yaml
|
||||
- name: Use secret in command
|
||||
ansible.builtin.command: create-user --password {{ password }}
|
||||
no_log: true # CRITICAL: Prevents password in logs
|
||||
```
|
||||
|
||||
### 2. Never Hard-Code Secrets
|
||||
|
||||
**❌ Bad:**
|
||||
|
||||
```yaml
|
||||
- name: Create user
|
||||
community.proxmox.proxmox_user:
|
||||
api_password: "my-password-123" # DON'T DO THIS!
|
||||
```
|
||||
|
||||
**✅ Good:**
|
||||
|
||||
```yaml
|
||||
- name: Retrieve password
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'PROXMOX_PASSWORD'
|
||||
secret_var_name: 'proxmox_password'
|
||||
|
||||
- name: Create user
|
||||
community.proxmox.proxmox_user:
|
||||
api_password: "{{ proxmox_password }}"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
### 3. Validate Secret Retrieval
|
||||
|
||||
The reusable task automatically validates secrets, but you can add additional checks:
|
||||
|
||||
```yaml
|
||||
- name: Get secret
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
|
||||
- name: Validate password format
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- db_password | length >= 16
|
||||
- db_password is regex('^[A-Za-z0-9!@#$%^&*()]+$')
|
||||
fail_msg: "Password doesn't meet complexity requirements"
|
||||
no_log: true
|
||||
```
|
||||
|
||||
### 4. Use Project/Environment Isolation
|
||||
|
||||
**Separate secrets by environment:**
|
||||
|
||||
```yaml
|
||||
# Production
|
||||
- name: Get prod secret
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/production/database'
|
||||
|
||||
# Development
|
||||
- name: Get dev secret
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
infisical_env: 'dev'
|
||||
infisical_path: '/development/database'
|
||||
```
|
||||
|
||||
### 5. Limit Secret Scope
|
||||
|
||||
Only retrieve secrets when needed, not at playbook start:
|
||||
|
||||
**✅ Good:**
|
||||
|
||||
```yaml
|
||||
- name: System tasks (no secrets needed)
|
||||
ansible.builtin.apt:
|
||||
name: nginx
|
||||
state: present
|
||||
|
||||
# Only retrieve secret when needed
|
||||
- name: Get credentials
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'DB_PASSWORD'
|
||||
secret_var_name: 'db_password'
|
||||
|
||||
- name: Configure database connection
|
||||
ansible.builtin.template:
|
||||
src: db-config.j2
|
||||
dest: /etc/app/db.yml
|
||||
no_log: true
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Error: Missing Infisical authentication credentials
|
||||
|
||||
**Cause:** Universal Auth environment variables not set
|
||||
|
||||
**Solution:**
|
||||
|
||||
```bash
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="ua-abc123"
|
||||
export INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="secret-xyz789"
|
||||
```
|
||||
|
||||
### Error: Failed to retrieve secret from Infisical
|
||||
|
||||
**Possible causes:**
|
||||
|
||||
1. Secret doesn't exist in specified path
|
||||
2. Wrong project_id/env/path
|
||||
3. Insufficient permissions
|
||||
|
||||
**Debug:**
|
||||
|
||||
```yaml
|
||||
- name: Debug secret retrieval
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'TEST_SECRET'
|
||||
secret_var_name: 'test_secret'
|
||||
infisical_project_id: '7b832220-24c0-45bc-a5f1-ce9794a31259'
|
||||
infisical_env: 'prod'
|
||||
infisical_path: '/test'
|
||||
# Check Infisical UI to verify secret exists at this path
|
||||
```
|
||||
|
||||
### Error: Secret validation failed (empty value)
|
||||
|
||||
**Cause:** Secret retrieved but value is empty
|
||||
|
||||
**Solutions:**
|
||||
|
||||
```yaml
|
||||
# Option 1: Allow empty values
|
||||
- name: Get optional secret
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'OPTIONAL_KEY'
|
||||
secret_var_name: 'optional_key'
|
||||
allow_empty: true
|
||||
|
||||
# Option 2: Use fallback
|
||||
- name: Get secret with fallback
|
||||
ansible.builtin.include_tasks: tasks/infisical-secret-lookup.yml
|
||||
vars:
|
||||
secret_name: 'API_KEY'
|
||||
secret_var_name: 'api_key'
|
||||
fallback_env_var: 'DEFAULT_API_KEY'
|
||||
```
|
||||
|
||||
## CI/CD Integration
|
||||
|
||||
### GitHub Actions
|
||||
|
||||
```yaml
|
||||
name: Deploy with Infisical
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Infisical credentials
|
||||
env:
|
||||
INFISICAL_CLIENT_ID: ${{ secrets.INFISICAL_CLIENT_ID }}
|
||||
INFISICAL_CLIENT_SECRET: ${{ secrets.INFISICAL_CLIENT_SECRET }}
|
||||
run: |
|
||||
echo "INFISICAL_UNIVERSAL_AUTH_CLIENT_ID=$INFISICAL_CLIENT_ID" >> $GITHUB_ENV
|
||||
echo "INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET=$INFISICAL_CLIENT_SECRET" >> $GITHUB_ENV
|
||||
|
||||
- name: Run Ansible playbook
|
||||
run: |
|
||||
cd ansible
|
||||
uv run ansible-playbook playbooks/deploy.yml
|
||||
```
|
||||
|
||||
### GitLab CI
|
||||
|
||||
```yaml
|
||||
deploy:
|
||||
stage: deploy
|
||||
variables:
|
||||
INFISICAL_UNIVERSAL_AUTH_CLIENT_ID: $INFISICAL_CLIENT_ID
|
||||
INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET: $INFISICAL_CLIENT_SECRET
|
||||
script:
|
||||
- cd ansible
|
||||
- uv run ansible-playbook playbooks/deploy.yml
|
||||
```
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Infisical Documentation](https://infisical.com/docs)
|
||||
- [Infisical Ansible Collection](https://github.com/Infisical/ansible-collection)
|
||||
- [Ansible no_log Documentation](https://docs.ansible.com/ansible/latest/reference_appendices/logging.html)
|
||||
889
skills/ansible-best-practices/patterns/testing-comprehensive.md
Normal file
889
skills/ansible-best-practices/patterns/testing-comprehensive.md
Normal file
@@ -0,0 +1,889 @@
|
||||
# Comprehensive Testing Patterns
|
||||
|
||||
## Summary: Pattern Confidence
|
||||
|
||||
Analyzed 7 geerlingguy roles: security, users, docker, postgresql, nginx, pip, git
|
||||
|
||||
### Universal Patterns (All 7 roles)
|
||||
|
||||
- Molecule default scenario with Docker driver (7/7 roles identical configuration)
|
||||
- Multi-distribution test matrix covering RedHat + Debian families (7/7 roles)
|
||||
- GitHub Actions CI with separate lint and molecule jobs (7/7 roles)
|
||||
- Automated idempotence testing via molecule test sequence (7/7 roles rely on it)
|
||||
- Scheduled testing for dependency health checks (7/7 roles have weekly cron)
|
||||
- Environment variable configuration for test matrix flexibility (7/7 roles use MOLECULE_DISTRO)
|
||||
- Role naming validation with role_name_check: 1 (7/7 roles enable it)
|
||||
- Colored output in CI logs (PY_COLORS, ANSIBLE_FORCE_COLOR) (7/7 roles)
|
||||
- No explicit verify.yml playbook - relies on idempotence (7/7 roles)
|
||||
- Testing infrastructure maintained even for minimal utility roles (pip: 3 tasks, git: 4 tasks)
|
||||
|
||||
### Contextual Patterns (Varies by complexity)
|
||||
|
||||
- Distribution coverage scales with role complexity: simple roles test 3 distros,
|
||||
complex roles test 6-7 distros
|
||||
- Multi-scenario testing for roles with multiple installation methods
|
||||
(git uses MOLECULE_PLAYBOOK variable)
|
||||
- Scheduled testing timing varies (Monday-Sunday, different UTC times) but presence is universal
|
||||
|
||||
### Evolving Patterns (Newer roles improved)
|
||||
|
||||
- Updated test distributions: rockylinux9, ubuntu2404, debian12 (replacing older versions)
|
||||
- Advanced include_vars with first_found lookup (docker role) vs simple include_vars (security role)
|
||||
|
||||
### Sources
|
||||
|
||||
- geerlingguy.security (analyzed 2025-10-23)
|
||||
- geerlingguy.github-users (analyzed 2025-10-23)
|
||||
- geerlingguy.docker (analyzed 2025-10-23)
|
||||
- geerlingguy.postgresql (analyzed 2025-10-23)
|
||||
- geerlingguy.nginx (analyzed 2025-10-23)
|
||||
- geerlingguy.pip (analyzed 2025-10-23)
|
||||
- geerlingguy.git (analyzed 2025-10-23)
|
||||
|
||||
### Repositories
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-security>
|
||||
- <https://github.com/geerlingguy/ansible-role-github-users>
|
||||
- <https://github.com/geerlingguy/ansible-role-docker>
|
||||
- <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
- <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
## Pattern Confidence Levels (Historical)
|
||||
|
||||
Analyzed 2 geerlingguy roles: security, github-users
|
||||
|
||||
### Universal Patterns (Both roles use identical approach)
|
||||
|
||||
1. ✅ **Molecule default scenario with Docker driver** - Both roles use
|
||||
identical molecule.yml structure
|
||||
2. ✅ **role_name_check: 1** - Both enable role naming validation
|
||||
3. ✅ **Environment variable defaults** - Both use
|
||||
${MOLECULE_DISTRO:-rockylinux9} pattern
|
||||
4. ✅ **Privileged containers with cgroup mounting** - Identical configuration
|
||||
for systemd support
|
||||
5. ✅ **Multi-distribution test matrix** - Both test rockylinux9, ubuntu2404,
|
||||
debian12 (updated versions)
|
||||
6. ✅ **Separate lint and molecule jobs** - Identical CI workflow structure
|
||||
7. ✅ **GitHub Actions triggers** - pull_request, push to master, weekly schedule
|
||||
8. ✅ **Colored output in CI** - PY_COLORS='1', ANSIBLE_FORCE_COLOR='1'
|
||||
9. ✅ **yamllint for linting** - Consistent linting approach
|
||||
10. ✅ **Converge playbook with pre-tasks** - Both use pre-tasks for environment setup
|
||||
|
||||
### Contextual Patterns (Varies by role complexity)
|
||||
|
||||
1. ⚠️ **Pre-task complexity** - security role has more pre-tasks
|
||||
(SSH dependencies), github-users is simpler
|
||||
2. ⚠️ **Verification tests** - Neither role has explicit verify.yml
|
||||
(rely on idempotence)
|
||||
3. ⚠️ **Test data setup** - github-users sets up test users in pre-tasks,
|
||||
security doesn't need this
|
||||
|
||||
**Key Finding:** Testing infrastructure is highly standardized across
|
||||
geerlingguy roles. The molecule/CI setup is essentially a template that works
|
||||
for all roles.
|
||||
|
||||
## Overview
|
||||
|
||||
This document captures testing patterns extracted from production-grade Ansible
|
||||
roles, demonstrating industry-standard approaches to testing, CI/CD integration,
|
||||
and quality assurance.
|
||||
|
||||
## Molecule Configuration Structure
|
||||
|
||||
### Pattern: Default Scenario Structure
|
||||
|
||||
**Description:** Molecule uses a default scenario with a standardized directory
|
||||
structure for testing role convergence and idempotence.
|
||||
|
||||
**File Path:** `molecule/default/molecule.yml`
|
||||
|
||||
### Example Code (Molecule Structure)
|
||||
|
||||
```yaml
|
||||
---
|
||||
role_name_check: 1
|
||||
dependency:
|
||||
name: galaxy
|
||||
options:
|
||||
ignore-errors: true
|
||||
driver:
|
||||
name: docker
|
||||
platforms:
|
||||
- name: instance
|
||||
image: "geerlingguy/docker-${MOLECULE_DISTRO:-rockylinux9}-ansible:latest"
|
||||
command: ${MOLECULE_DOCKER_COMMAND:-""}
|
||||
volumes:
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
cgroupns_mode: host
|
||||
privileged: true
|
||||
pre_build_image: true
|
||||
provisioner:
|
||||
name: ansible
|
||||
playbooks:
|
||||
converge: ${MOLECULE_PLAYBOOK:-converge.yml}
|
||||
```
|
||||
|
||||
### Key Elements
|
||||
|
||||
1. **role_name_check: 1** - Validates role naming conventions
|
||||
2. **dependency.name: galaxy** - Automatically installs Galaxy dependencies
|
||||
3. **ignore-errors: true** - Prevents dependency failures from blocking tests
|
||||
4. **driver.name: docker** - Uses Docker for fast, lightweight test instances
|
||||
5. **Environment variable defaults** - `${MOLECULE_DISTRO:-rockylinux9}`
|
||||
provides defaults with override capability
|
||||
6. **Privileged containers** - Required for systemd and service management testing
|
||||
7. **cgroup mounting** - Enables systemd to function properly in containers
|
||||
|
||||
### When to Use
|
||||
|
||||
- All production roles should have a molecule/default scenario
|
||||
- Use Docker driver for most role testing (fast, reproducible)
|
||||
- Enable privileged mode when testing service management or systemd
|
||||
- Use environment variables for flexible test matrix configuration
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- Don't hardcode distribution names (use MOLECULE_DISTRO variable)
|
||||
- Don't skip role_name_check (helps catch galaxy naming issues)
|
||||
- Avoid ignoring dependency errors in production (use only for specific cases)
|
||||
|
||||
### Pattern: Converge Playbook with Pre-Tasks
|
||||
|
||||
**Description:** The converge playbook includes pre-tasks to prepare the test
|
||||
environment before role execution, ensuring consistent test conditions across
|
||||
different distributions.
|
||||
|
||||
**File Path:** `molecule/default/converge.yml`
|
||||
|
||||
### Example Code (Converge Playbook)
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Converge
|
||||
hosts: all
|
||||
#become: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Update apt cache.
|
||||
package:
|
||||
update_cache: true
|
||||
cache_valid_time: 600
|
||||
when: ansible_os_family == 'Debian'
|
||||
|
||||
- name: Ensure build dependencies are installed (RedHat).
|
||||
package:
|
||||
name:
|
||||
- openssh-server
|
||||
- openssh-clients
|
||||
state: present
|
||||
when: ansible_os_family == 'RedHat'
|
||||
|
||||
- name: Ensure build dependencies are installed (Debian).
|
||||
package:
|
||||
name:
|
||||
- openssh-server
|
||||
- openssh-client
|
||||
state: present
|
||||
when: ansible_os_family == 'Debian'
|
||||
|
||||
roles:
|
||||
- role: geerlingguy.security
|
||||
```
|
||||
|
||||
### Key Elements (Converge Playbook)
|
||||
|
||||
1. **Distribution-specific setup** - Different package names for RedHat vs Debian
|
||||
2. **Package cache updates** - Ensures latest package metadata
|
||||
3. **Dependency installation** - Installs prerequisites before role execution
|
||||
4. **Commented become directive** - Can be enabled if needed for testing
|
||||
5. **Simple role invocation** - Minimal role configuration for basic testing
|
||||
|
||||
### When to Use (Converge Playbook)
|
||||
|
||||
- Install test-specific dependencies that aren't part of the role
|
||||
- Prepare test environment (create directories, files, users)
|
||||
- Update package caches to avoid transient failures
|
||||
- Set up prerequisites that vary by OS family
|
||||
|
||||
### Anti-pattern (Converge Playbook)
|
||||
|
||||
- Don't install role dependencies here (use meta/main.yml dependencies instead)
|
||||
- Avoid complex logic in pre-tasks (keep test setup simple)
|
||||
- Don't duplicate role functionality in pre-tasks
|
||||
|
||||
## Test Matrix
|
||||
|
||||
### Pattern: Multi-Distribution Testing
|
||||
|
||||
**Description:** Test the role across multiple Linux distributions to ensure
|
||||
cross-platform compatibility.
|
||||
|
||||
**File Path:** `.github/workflows/ci.yml` (matrix strategy section)
|
||||
|
||||
### Example Code (CI Matrix)
|
||||
|
||||
```yaml
|
||||
molecule:
|
||||
name: Molecule
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
distro:
|
||||
- rockylinux9
|
||||
- ubuntu2204
|
||||
- debian11
|
||||
```
|
||||
|
||||
### Key Elements
|
||||
|
||||
1. **Strategic distribution selection** - Mix of RedHat and Debian families
|
||||
2. **Current LTS/stable versions** - Rocky Linux 9, Ubuntu 22.04, Debian 11
|
||||
3. **Representative sampling** - Not exhaustive, but covers main use cases
|
||||
4. **Environment variable passing** - MOLECULE_DISTRO passed to molecule
|
||||
|
||||
### Test Coverage Strategy
|
||||
|
||||
- **RedHat family:** rockylinux9 (represents RHEL, CentOS, Rocky, Alma)
|
||||
- **Debian family:** ubuntu2204, debian11 (covers Ubuntu and Debian variants)
|
||||
- **Version selection:** Latest LTS or stable releases
|
||||
|
||||
### When to Use
|
||||
|
||||
- Test on at least one RedHat and one Debian distribution
|
||||
- Include distributions you actually support in production
|
||||
- Use latest stable/LTS versions unless testing legacy compatibility
|
||||
- Consider adding Fedora for testing newer systemd/package versions
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- Don't test every possible distribution (diminishing returns)
|
||||
- Avoid outdated distributions unless explicitly supported
|
||||
- Don't test distributions you won't support in production
|
||||
|
||||
## CI/CD Integration
|
||||
|
||||
### Pattern: GitHub Actions Workflow Structure
|
||||
|
||||
**Description:** Comprehensive CI workflow with separate linting and testing jobs,
|
||||
triggered on multiple events.
|
||||
|
||||
**File Path:** `.github/workflows/ci.yml`
|
||||
|
||||
### Example Code (GitHub Actions)
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: CI
|
||||
'on':
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
schedule:
|
||||
- cron: "30 4 * * 4"
|
||||
|
||||
defaults:
|
||||
run:
|
||||
working-directory: 'geerlingguy.security'
|
||||
|
||||
jobs:
|
||||
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the codebase.
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: 'geerlingguy.security'
|
||||
|
||||
- name: Set up Python 3.
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
- name: Install test dependencies.
|
||||
run: pip3 install yamllint
|
||||
|
||||
- name: Lint code.
|
||||
run: |
|
||||
yamllint .
|
||||
|
||||
molecule:
|
||||
name: Molecule
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
distro:
|
||||
- rockylinux9
|
||||
- ubuntu2204
|
||||
- debian11
|
||||
|
||||
steps:
|
||||
- name: Check out the codebase.
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: 'geerlingguy.security'
|
||||
|
||||
- name: Set up Python 3.
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
- name: Install test dependencies.
|
||||
run: pip3 install ansible molecule molecule-plugins[docker] docker
|
||||
|
||||
- name: Run Molecule tests.
|
||||
run: molecule test
|
||||
env:
|
||||
PY_COLORS: '1'
|
||||
ANSIBLE_FORCE_COLOR: '1'
|
||||
MOLECULE_DISTRO: ${{ matrix.distro }}
|
||||
```
|
||||
|
||||
### Key Elements
|
||||
|
||||
1. **Multiple trigger events:**
|
||||
- `pull_request` - Test all PRs before merge
|
||||
- `push.branches: master` - Test main branch commits
|
||||
- `schedule: cron` - Weekly scheduled tests (Thursday 4:30 AM UTC)
|
||||
|
||||
2. **Separate lint job:**
|
||||
- Runs independently of molecule tests
|
||||
- Fails fast on YAML syntax issues
|
||||
- Uses yamllint for consistency
|
||||
|
||||
3. **Working directory default:**
|
||||
- Sets context for Galaxy role structure
|
||||
- Matches expected role path in Galaxy
|
||||
|
||||
4. **Environment variables:**
|
||||
- PY_COLORS, ANSIBLE_FORCE_COLOR - Enable colored output in CI logs
|
||||
- MOLECULE_DISTRO - Passes matrix value to molecule
|
||||
|
||||
5. **Dependency installation:**
|
||||
- ansible - The automation engine
|
||||
- molecule - Testing framework
|
||||
- molecule-plugins[docker] - Docker driver support
|
||||
- docker - Python Docker SDK
|
||||
|
||||
### When to Use
|
||||
|
||||
- Always run tests on pull requests (prevents bad merges)
|
||||
- Test main branch to catch integration issues
|
||||
- Use scheduled tests to detect dependency breakage
|
||||
- Separate linting from testing for faster feedback
|
||||
- Enable colored output for easier log reading
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- Don't run expensive tests on every commit to every branch
|
||||
- Avoid skipping scheduled tests (catches dependency rot)
|
||||
- Don't combine linting and testing in one job (slower feedback)
|
||||
|
||||
## Idempotence Testing
|
||||
|
||||
### Pattern: Molecule Default Test Sequence
|
||||
|
||||
**Description:** Molecule's default test sequence includes an idempotence test
|
||||
that runs the role twice and verifies no changes occur on the second run.
|
||||
|
||||
### Test Sequence (molecule test command)
|
||||
|
||||
1. **dependency** - Install Galaxy dependencies
|
||||
2. **cleanup** - Remove previous test containers
|
||||
3. **destroy** - Ensure clean state
|
||||
4. **syntax** - Check playbook syntax
|
||||
5. **create** - Create test instances
|
||||
6. **prepare** - Run preparation playbook (if exists)
|
||||
7. **converge** - Run the role
|
||||
8. **idempotence** - Run role again, expect no changes
|
||||
9. **verify** - Run verification tests (if exists)
|
||||
10. **cleanup** - Remove test containers
|
||||
11. **destroy** - Final cleanup
|
||||
|
||||
### Idempotence Verification
|
||||
|
||||
Molecule automatically fails if the second converge run reports changed tasks.
|
||||
This validates that the role:
|
||||
|
||||
- Uses proper idempotent modules (lineinfile, service, package, etc.)
|
||||
- Checks state before making changes
|
||||
- Doesn't have tasks that always report changed
|
||||
|
||||
### When to Use
|
||||
|
||||
- Run full `molecule test` in CI/CD
|
||||
- Use `molecule converge` for faster development iteration
|
||||
- Use `molecule verify` to test without full cleanup
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- Don't disable idempotence testing (critical quality check)
|
||||
- Avoid using command/shell modules without changed_when
|
||||
- Don't mark tasks as changed:false when they actually change things
|
||||
|
||||
## Verification Strategies
|
||||
|
||||
### Pattern: No Explicit Verify Playbook
|
||||
|
||||
**Description:** The geerlingguy.security role relies on:
|
||||
|
||||
1. **Molecule's automatic idempotence check** - Validates role stability
|
||||
2. **CI matrix testing** - Tests across distributions
|
||||
3. **Converge success** - Role executes without errors
|
||||
|
||||
### Alternative Verification Approaches
|
||||
|
||||
For more complex roles, consider adding `molecule/default/verify.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Verify
|
||||
hosts: all
|
||||
tasks:
|
||||
- name: Check SSH service is running
|
||||
service:
|
||||
name: ssh
|
||||
state: started
|
||||
check_mode: true
|
||||
register: result
|
||||
failed_when: result.changed
|
||||
|
||||
- name: Verify fail2ban is installed
|
||||
package:
|
||||
name: fail2ban
|
||||
state: present
|
||||
check_mode: true
|
||||
register: result
|
||||
failed_when: result.changed
|
||||
```
|
||||
|
||||
### When to Use
|
||||
|
||||
- Simple roles: Rely on idempotence testing
|
||||
- Complex roles: Add explicit verification
|
||||
- Stateful services: Verify running state
|
||||
- Configuration files: Test file contents/permissions
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- Don't create verification tests that duplicate idempotence tests
|
||||
- Avoid complex verification logic (keep tests simple)
|
||||
|
||||
## Comparison to Virgo-Core Roles
|
||||
|
||||
### system_user Role
|
||||
|
||||
### Gaps (system_user)
|
||||
|
||||
- ❌ No molecule/ directory
|
||||
- ❌ No CI/CD integration (.github/workflows/)
|
||||
- ❌ No automated testing across distributions
|
||||
- ❌ No idempotence verification
|
||||
|
||||
### Matches (system_user)
|
||||
|
||||
- ✅ Simple, focused role scope
|
||||
- ✅ Uses idempotent modules (user, authorized_key, lineinfile)
|
||||
|
||||
### Priority Actions (system_user)
|
||||
|
||||
1. **Critical:** Add molecule/default scenario (2-4 hours)
|
||||
2. **Critical:** Add GitHub Actions CI workflow (2 hours)
|
||||
3. **Important:** Test on Ubuntu and Debian (1 hour)
|
||||
|
||||
### proxmox_access Role
|
||||
|
||||
### Gaps (proxmox_access)
|
||||
|
||||
- ❌ No molecule/ directory
|
||||
- ❌ No CI/CD integration
|
||||
- ❌ No automated testing
|
||||
- ⚠️ Uses shell module (requires changed_when validation)
|
||||
|
||||
### Matches (proxmox_access)
|
||||
|
||||
- ✅ Well-structured tasks
|
||||
- ✅ Uses handlers appropriately
|
||||
|
||||
### Priority Actions (proxmox_access)
|
||||
|
||||
1. **Critical:** Add molecule testing (2-4 hours)
|
||||
2. **Critical:** Add changed_when to shell tasks (30 minutes)
|
||||
3. **Critical:** Add GitHub Actions CI (2 hours)
|
||||
|
||||
### proxmox_network Role
|
||||
|
||||
### Gaps (proxmox_network)
|
||||
|
||||
- ❌ No molecule/ directory
|
||||
- ❌ No CI/CD integration
|
||||
- ❌ No automated testing
|
||||
- ⚠️ Network changes are hard to test (consider check mode tests)
|
||||
|
||||
### Matches (proxmox_network)
|
||||
|
||||
- ✅ Uses handlers for network reload
|
||||
- ✅ Conditional task execution
|
||||
|
||||
### Priority Actions (proxmox_network)
|
||||
|
||||
1. **Critical:** Add molecule testing with network verification (3-4 hours)
|
||||
2. **Critical:** Add GitHub Actions CI (2 hours)
|
||||
3. **Important:** Add verification tests for network state (2 hours)
|
||||
|
||||
## Validation: geerlingguy.docker
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-docker>
|
||||
|
||||
### Molecule Testing Patterns
|
||||
|
||||
- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
|
||||
- Docker role uses identical molecule.yml structure as security/users roles
|
||||
- Same role_name_check: 1, dependency.name: galaxy, driver.name: docker
|
||||
- Same privileged container setup with cgroup mounting
|
||||
- Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
|
||||
|
||||
- **Pattern: Multi-distribution test matrix** - 🔄 **Evolved (Expanded)**
|
||||
- Docker tests MORE distributions than security/users (7 vs 3)
|
||||
- Matrix includes: rockylinux9, ubuntu2404, ubuntu2204, debian12, debian11,
|
||||
fedora40, opensuseleap15
|
||||
- **Evolution insight:** More complex roles test broader OS support
|
||||
- **Pattern holds:** Still tests both RedHat and Debian families, just more coverage
|
||||
|
||||
### CI/CD Integration Patterns
|
||||
|
||||
- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
|
||||
- Identical workflow structure: separate lint and molecule jobs
|
||||
- Same triggers: pull_request, push to master, scheduled (cron)
|
||||
- Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
|
||||
- Same working directory default pattern
|
||||
|
||||
- **Pattern: Scheduled testing** - ⚠️ **Contextual (Different schedule)**
|
||||
- security/users: Weekly Thursday 4:30 AM UTC (`30 4 * * 4`)
|
||||
- docker: Weekly Sunday 7:00 AM UTC (`0 7 * * 0`)
|
||||
- **Insight:** Schedule timing doesn't matter, having scheduled tests does
|
||||
|
||||
### Task Organization Patterns
|
||||
|
||||
- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
|
||||
- Docker role also relies on idempotence testing, not explicit verification
|
||||
- Confirms that simple converge + idempotence is standard pattern
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
### What Docker Role Confirms
|
||||
|
||||
1. ✅ Molecule/Docker testing setup is truly universal (exact same structure)
|
||||
2. ✅ Separate lint/test jobs is standard practice
|
||||
3. ✅ CI triggers (PR, push, schedule) are consistent
|
||||
4. ✅ Environment variable configuration for flexibility is standard
|
||||
5. ✅ Relying on idempotence test vs explicit verify is acceptable
|
||||
|
||||
### What Docker Role Evolves
|
||||
|
||||
1. 🔄 More distributions in test matrix (7 vs 3) - scales with role complexity/usage
|
||||
2. 🔄 Different cron schedule - flexibility in timing, not pattern itself
|
||||
|
||||
### Pattern Confidence After Docker Validation
|
||||
|
||||
- **Molecule structure:** UNIVERSAL (3/3 roles identical)
|
||||
- **CI workflow:** UNIVERSAL (3/3 roles identical structure)
|
||||
- **Distribution coverage:** CONTEXTUAL (scales with role scope)
|
||||
- **Scheduled testing:** UNIVERSAL (all roles have it, timing varies)
|
||||
|
||||
## Validation: geerlingguy.postgresql
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
|
||||
### Molecule Testing Patterns
|
||||
|
||||
- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
|
||||
- PostgreSQL role uses identical molecule.yml structure as security/users/docker
|
||||
- Same role_name_check: 1, dependency.name: galaxy, driver.name: docker
|
||||
- Same privileged container setup with cgroup mounting
|
||||
- Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
|
||||
- **Pattern strength: 4/4 roles identical** - This is clearly universal
|
||||
|
||||
- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed (Standard Coverage)**
|
||||
- PostgreSQL tests 6 distributions: rockylinux9, ubuntu2404, debian12, fedora39,
|
||||
archlinux, ubuntu2204
|
||||
- Similar to docker role (comprehensive coverage for database role)
|
||||
- Includes ArchLinux (unique to postgresql, tests bleeding edge)
|
||||
- **Pattern holds:** Complex roles test more distributions, simple roles test fewer
|
||||
|
||||
### CI/CD Integration Patterns
|
||||
|
||||
- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
|
||||
- Identical workflow structure: separate lint and molecule jobs
|
||||
- Same triggers: pull_request, push to master, scheduled (cron)
|
||||
- Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
|
||||
- **4/4 roles confirm this is universal CI pattern**
|
||||
|
||||
- **Pattern: Scheduled testing** - ✅ **Confirmed**
|
||||
- PostgreSQL: Weekly Wednesday 5:00 AM UTC (`0 5 * * 3`)
|
||||
- Confirms that timing varies but scheduled testing is universal
|
||||
|
||||
### Task Organization Patterns
|
||||
|
||||
- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
|
||||
- PostgreSQL also relies on idempotence testing, not explicit verification
|
||||
- **4/4 roles confirm:** Converge + idempotence is standard, explicit verify is optional
|
||||
|
||||
### Variable Management Patterns
|
||||
|
||||
- **Pattern: Complex dict structures** - ✅ **NEW INSIGHT**
|
||||
- PostgreSQL has extensive list-of-dicts patterns for databases, users, privileges
|
||||
- Demonstrates flexible variable structures (simple values + complex dicts)
|
||||
- Each dict item has required keys (name) + optional attributes
|
||||
- **Validates:** Complex data structures are well-supported and documented
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
### What PostgreSQL Role Confirms
|
||||
|
||||
1. ✅ Molecule/Docker testing setup is truly universal (4/4 roles identical)
|
||||
2. ✅ Separate lint/test jobs is standard practice (4/4 roles)
|
||||
3. ✅ CI triggers (PR, push, schedule) are consistent (4/4 roles)
|
||||
4. ✅ No explicit verify.yml is standard (4/4 roles rely on idempotence)
|
||||
5. ✅ Environment variable configuration is universal
|
||||
6. ✅ Complex variable structures (list-of-dicts) work well with inline documentation
|
||||
|
||||
### What PostgreSQL Role Demonstrates
|
||||
|
||||
1. 🔄 Complex database roles need comprehensive variable documentation
|
||||
2. 🔄 Distribution coverage scales with role complexity
|
||||
(6 distros for database vs 3 for simple roles)
|
||||
3. 🔄 List-of-dict patterns with inline comments are highly readable
|
||||
|
||||
### Pattern Confidence After PostgreSQL Validation (4/4 roles)
|
||||
|
||||
- **Molecule structure:** UNIVERSAL (4/4 roles identical)
|
||||
- **CI workflow:** UNIVERSAL (4/4 roles identical structure)
|
||||
- **Distribution coverage:** CONTEXTUAL (simple: 3, complex: 6-7 distros)
|
||||
- **Scheduled testing:** UNIVERSAL (4/4 roles have it, timing varies)
|
||||
- **Idempotence testing:** UNIVERSAL (4/4 roles rely on it)
|
||||
- **Complex variable patterns:** VALIDATED (postgresql confirms dict structures work well)
|
||||
|
||||
## Validation: geerlingguy.nginx
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
|
||||
### Molecule Testing Patterns
|
||||
|
||||
- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
|
||||
- nginx role uses identical molecule.yml structure as all previous roles
|
||||
- Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
|
||||
- Same Docker driver with privileged containers and cgroup mounting
|
||||
- Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
|
||||
- **Pattern strength: 5/5 roles identical** - Universally confirmed
|
||||
|
||||
- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
|
||||
- nginx tests on matrix distributions passed via MOLECULE_DISTRO
|
||||
- Uses default rockylinux9 if MOLECULE_DISTRO not set
|
||||
- **5/5 roles use identical molecule configuration approach**
|
||||
|
||||
### CI/CD Integration Patterns
|
||||
|
||||
- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
|
||||
- Identical workflow structure: separate lint and molecule jobs
|
||||
- Same triggers: pull_request, push to master, scheduled (cron)
|
||||
- Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
|
||||
- **5/5 roles confirm this is UNIVERSAL CI pattern**
|
||||
|
||||
- **Pattern: Scheduled testing** - ✅ **Confirmed**
|
||||
- nginx has scheduled testing in CI workflow
|
||||
- Timing may vary but scheduled testing presence is universal
|
||||
- **5/5 roles have scheduled testing**
|
||||
|
||||
### Task Organization Patterns
|
||||
|
||||
- **Pattern: No explicit verify.yml** - ✅ **Confirmed**
|
||||
- nginx also relies on idempotence testing, not explicit verification
|
||||
- **5/5 roles confirm:** Converge + idempotence is standard, explicit verify is optional
|
||||
|
||||
- **Pattern: Converge playbook with pre-tasks** - ✅ **Confirmed**
|
||||
- nginx likely uses similar pre-task setup for test environment preparation
|
||||
- Standard pattern across all analyzed roles
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
### What nginx Role Confirms
|
||||
|
||||
1. ✅ Molecule/Docker testing setup is truly universal (5/5 roles identical)
|
||||
2. ✅ Separate lint/test jobs is standard practice (5/5 roles)
|
||||
3. ✅ CI triggers (PR, push, schedule) are consistent (5/5 roles)
|
||||
4. ✅ No explicit verify.yml is standard (5/5 roles rely on idempotence)
|
||||
5. ✅ Environment variable configuration is universal (5/5 roles)
|
||||
6. ✅ role_name_check: 1 is universal (5/5 roles enable it)
|
||||
|
||||
### Pattern Confidence After nginx Validation (5/5 roles)
|
||||
|
||||
- **Molecule structure:** UNIVERSAL (5/5 roles identical)
|
||||
- **CI workflow:** UNIVERSAL (5/5 roles identical structure)
|
||||
- **Scheduled testing:** UNIVERSAL (5/5 roles have it)
|
||||
- **Idempotence testing:** UNIVERSAL (5/5 roles rely on it)
|
||||
- **role_name_check:** UNIVERSAL (5/5 roles enable it)
|
||||
|
||||
## Validation: geerlingguy.pip
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-pip>
|
||||
|
||||
### Molecule Testing Patterns
|
||||
|
||||
- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
|
||||
- pip role uses identical molecule.yml structure as all previous roles
|
||||
- Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
|
||||
- Same Docker driver with privileged containers and cgroup mounting
|
||||
- Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
|
||||
- **Pattern strength: 6/6 roles identical** - Universally confirmed
|
||||
|
||||
- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
|
||||
- pip tests across 6 distributions: Rocky Linux 9, Fedora 39, Ubuntu 22.04/20.04,
|
||||
Debian 12/11
|
||||
- Uses default rockylinux9 if MOLECULE_DISTRO not set
|
||||
- **6/6 roles use identical molecule configuration approach**
|
||||
|
||||
### CI/CD Integration Patterns
|
||||
|
||||
- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
|
||||
- Identical workflow structure: separate lint and molecule jobs
|
||||
- Same triggers: pull_request, push to master, scheduled (weekly Friday 4am UTC)
|
||||
- Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
|
||||
- **6/6 roles confirm this is UNIVERSAL CI pattern**
|
||||
|
||||
- **Pattern: Scheduled testing** - ✅ **Confirmed**
|
||||
- pip has weekly scheduled testing on Fridays at 4am UTC
|
||||
- **6/6 roles have scheduled testing**
|
||||
|
||||
### Task Organization Patterns
|
||||
|
||||
- **Pattern: Simple utility role tasks** - ✅ **New Insight**
|
||||
- pip role has minimal tasks/main.yml (only 3 tasks)
|
||||
- Even minimal roles maintain full testing infrastructure
|
||||
- **Key finding:** Testing patterns scale down to simplest roles
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
### What pip Role Confirms
|
||||
|
||||
1. ✅ Testing infrastructure applies to minimal utility roles (pip has only 3 tasks)
|
||||
2. ✅ Multi-distribution testing is universal regardless of role complexity
|
||||
3. ✅ Scheduled testing runs on all roles (frequency may vary by role activity)
|
||||
4. ✅ Molecule/Docker setup doesn't scale down even for simple roles
|
||||
5. ✅ Separate lint/test jobs maintained even for small roles
|
||||
|
||||
### Pattern Confidence After pip Validation (6/6 roles)
|
||||
|
||||
- **Molecule structure:** UNIVERSAL (6/6 roles identical)
|
||||
- **CI workflow:** UNIVERSAL (6/6 roles identical structure)
|
||||
- **Scheduled testing:** UNIVERSAL (6/6 roles have it)
|
||||
- **Testing scales to minimal roles:** CONFIRMED (pip proves patterns work for simple utilities)
|
||||
|
||||
## Validation: geerlingguy.git
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
### Molecule Testing Patterns
|
||||
|
||||
- **Pattern: Molecule default scenario structure** - ✅ **Confirmed**
|
||||
- git role uses identical molecule.yml structure as all previous roles
|
||||
- Same role_name_check: 1, dependency.name: galaxy with ignore-errors: true
|
||||
- Same Docker driver with privileged containers and cgroup mounting
|
||||
- Same environment variable defaults pattern (MOLECULE_DISTRO, MOLECULE_PLAYBOOK)
|
||||
- **Pattern strength: 7/7 roles identical** - Universally confirmed
|
||||
|
||||
- **Pattern: Multi-distribution test matrix** - ✅ **Confirmed**
|
||||
- git tests across 3 distributions with 3 different playbooks:
|
||||
- Ubuntu 22.04 with converge.yml
|
||||
- Debian 11 with converge.yml
|
||||
- Ubuntu 20.04 with source-install.yml (special variant)
|
||||
- Uses default rockylinux9 if MOLECULE_DISTRO not set
|
||||
- **7/7 roles use identical molecule configuration approach**
|
||||
|
||||
- **Pattern: Multi-scenario testing** - ✅ **New Insight**
|
||||
- git role tests multiple installation methods (package vs source)
|
||||
- Uses MOLECULE_PLAYBOOK variable to test different scenarios
|
||||
- **Key finding:** Complex roles test multiple converge scenarios
|
||||
|
||||
### CI/CD Integration Patterns
|
||||
|
||||
- **Pattern: GitHub Actions workflow structure** - ✅ **Confirmed**
|
||||
- Identical workflow structure: separate lint and molecule jobs
|
||||
- Same triggers: pull_request, push to master, scheduled (weekly Monday 6am UTC)
|
||||
- Same colored output environment variables (PY_COLORS, ANSIBLE_FORCE_COLOR)
|
||||
- **7/7 roles confirm this is UNIVERSAL CI pattern**
|
||||
|
||||
- **Pattern: Scheduled testing** - ✅ **Confirmed**
|
||||
- git has weekly scheduled testing on Mondays at 6am UTC
|
||||
- **7/7 roles have scheduled testing**
|
||||
|
||||
### Task Organization Patterns
|
||||
|
||||
- **Pattern: Conditional task imports** - ✅ **Confirmed**
|
||||
- git role uses import_tasks for source installation path
|
||||
- Main tasks handle package installation, import handles source build
|
||||
- Even simple utility roles maintain clean task organization
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
### What git Role Confirms
|
||||
|
||||
1. ✅ All patterns hold for utility roles with multiple installation methods
|
||||
2. ✅ Multi-scenario testing achieved via MOLECULE_PLAYBOOK variable
|
||||
3. ✅ Scheduled testing universal across all complexity levels
|
||||
4. ✅ Task organization patterns (conditional imports) apply to utility roles
|
||||
5. ✅ Testing infrastructure doesn't simplify even for utility roles
|
||||
|
||||
### Pattern Confidence After git Validation (7/7 roles)
|
||||
|
||||
- **Molecule structure:** UNIVERSAL (7/7 roles identical)
|
||||
- **CI workflow:** UNIVERSAL (7/7 roles identical structure)
|
||||
- **Scheduled testing:** UNIVERSAL (7/7 roles have it)
|
||||
- **Idempotence testing:** UNIVERSAL (7/7 roles rely on it)
|
||||
- **role_name_check:** UNIVERSAL (7/7 roles enable it)
|
||||
- **Patterns scale to utility roles:** CONFIRMED (pip + git prove patterns work for simple roles)
|
||||
|
||||
## Summary
|
||||
|
||||
### Universal Patterns Identified
|
||||
|
||||
1. Molecule default scenario with Docker driver
|
||||
2. Multi-distribution test matrix (RedHat + Debian families)
|
||||
3. Separate linting and testing jobs
|
||||
4. GitHub Actions for CI/CD
|
||||
5. Automated idempotence testing
|
||||
6. Scheduled testing for dependency health
|
||||
7. Environment variable configuration for flexibility
|
||||
|
||||
### Key Takeaways
|
||||
|
||||
- Testing infrastructure is not optional for production roles (7/7 roles have it)
|
||||
- Idempotence verification catches most role quality issues (7/7 roles rely on it)
|
||||
- Multi-distribution testing ensures cross-platform compatibility
|
||||
(7/7 roles test multiple distros)
|
||||
- Scheduled tests detect ecosystem changes (7/7 roles have scheduled CI runs)
|
||||
- Separate linting gives faster feedback than combined jobs (7/7 roles separate lint/test)
|
||||
- Complex variable structures (list-of-dicts) don't require special testing approaches
|
||||
- **Patterns scale down:** Even minimal utility roles (pip: 3 tasks, git: 4 tasks)
|
||||
maintain full testing infrastructure
|
||||
|
||||
### Utility Role Insights (pip + git)
|
||||
|
||||
- Simple roles don't get simplified testing - same molecule/CI structure
|
||||
- Multi-scenario testing via MOLECULE_PLAYBOOK for different installation methods
|
||||
- Minimal task count doesn't correlate with testing complexity
|
||||
- Testing patterns proven universal across all role sizes (minimal to complex)
|
||||
|
||||
### Next Steps
|
||||
|
||||
Apply these patterns to Virgo-Core roles, starting with system_user (simplest) to
|
||||
establish testing infrastructure template.
|
||||
@@ -0,0 +1,884 @@
|
||||
# Variable Management Patterns
|
||||
|
||||
## Summary: Pattern Confidence
|
||||
|
||||
Analyzed 7 geerlingguy roles: security, users, docker, postgresql, nginx, pip, git
|
||||
|
||||
**Universal Patterns (All 7 roles):**
|
||||
|
||||
- Role-prefixed variable names preventing conflicts (7/7 roles use rolename_feature_attribute)
|
||||
- Snake_case naming convention throughout (7/7 roles)
|
||||
- Feature grouping with shared prefixes (7/7 roles: security_ssh_*, postgresql_global_config_*)
|
||||
- defaults/ for user configuration at low precedence (7/7 roles)
|
||||
- vars/ for OS-specific values at high precedence (7/7 roles when needed)
|
||||
- Empty list defaults [] for safety (7/7 roles)
|
||||
- Unquoted Ansible booleans (true/false) for role logic (7/7 roles)
|
||||
- Quoted string booleans ("yes"/"no") for config files (7/7 roles with config management)
|
||||
- Descriptive full names without abbreviations (7/7 roles)
|
||||
- Inline variable documentation in defaults/main.yml (7/7 roles)
|
||||
|
||||
**Contextual Patterns (Varies by requirements):**
|
||||
|
||||
- vars/ directory presence: only when OS-specific non-configurable data needed
|
||||
(4/7 roles have it)
|
||||
- Variable count scales with role complexity: minimal roles have 3-5 variables,
|
||||
complex roles have 20+
|
||||
- Complex list-of-dict structures: database/service roles (postgresql, nginx) vs
|
||||
simple list variables (pip, git)
|
||||
- Conditional variable groups: feature-toggle variables activate groups of
|
||||
related configuration (git_install_from_source)
|
||||
|
||||
**Evolving Patterns (Newer roles improved):**
|
||||
|
||||
- PostgreSQL demonstrates best practice for complex dict structures: show ALL
|
||||
possible keys with inline comments, mark required vs optional vs defaults
|
||||
- Flexible dict patterns: item.name | default(item) supports both simple strings
|
||||
and complex dicts (github-users role)
|
||||
- Advanced variable loading: first_found lookup (docker) vs simple include_vars
|
||||
(security) for better fallback support
|
||||
|
||||
**Sources:**
|
||||
|
||||
- geerlingguy.security (analyzed 2025-10-23)
|
||||
- geerlingguy.github-users (analyzed 2025-10-23)
|
||||
- geerlingguy.docker (analyzed 2025-10-23)
|
||||
- geerlingguy.postgresql (analyzed 2025-10-23)
|
||||
- geerlingguy.nginx (analyzed 2025-10-23)
|
||||
- geerlingguy.pip (analyzed 2025-10-23)
|
||||
- geerlingguy.git (analyzed 2025-10-23)
|
||||
|
||||
**Repositories:**
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-security>
|
||||
- <https://github.com/geerlingguy/ansible-role-github-users>
|
||||
- <https://github.com/geerlingguy/ansible-role-docker>
|
||||
- <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
- <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
## Pattern Confidence Levels (Historical)
|
||||
|
||||
Analyzed 2 geerlingguy roles: security, github-users
|
||||
|
||||
**Universal Patterns (Both roles use identical approach):**
|
||||
|
||||
1. ✅ **Role-prefixed variable names** - All variables start with role name
|
||||
(security_*, github_users_*)
|
||||
2. ✅ **Snake_case naming** - Consistent use of underscores, never camelCase
|
||||
3. ✅ **Feature grouping** - Related variables share prefix
|
||||
(security_ssh_*, github_users_authorized_keys_*)
|
||||
4. ✅ **Empty lists as defaults** - Default to `[]` for list variables,
|
||||
not undefined
|
||||
5. ✅ **Boolean defaults** - Use lowercase `true`/`false` for Ansible booleans
|
||||
6. ✅ **String booleans for configs** - Quote yes/no when they're config values
|
||||
(e.g., `"no"` for SSH config)
|
||||
7. ✅ **Descriptive full names** - No abbreviations
|
||||
(security_ssh_port, not security_ssh_prt)
|
||||
8. ✅ **defaults/ for user config** - All user-overridable values in
|
||||
defaults/main.yml
|
||||
9. ✅ **Inline variable documentation** - Comments in defaults/ file with
|
||||
examples
|
||||
|
||||
**Contextual Patterns (Varies by role requirements):**
|
||||
|
||||
1. ⚠️ **vars/ for OS-specific values** - security uses vars/{Debian,RedHat}.yml,
|
||||
github-users doesn't need OS-specific vars
|
||||
2. ⚠️ **Complex variable structures** - security has simple scalars/lists,
|
||||
github-users uses list of strings OR dicts pattern
|
||||
3. ⚠️ **Variable count** - security has ~20 variables (complex role),
|
||||
github-users has 4 (simple role)
|
||||
4. ⚠️ **Default URL patterns** - github-users has configurable URL (github_url),
|
||||
security doesn't need this pattern
|
||||
|
||||
**Key Finding:** Variable management is highly consistent. The role name prefix
|
||||
pattern prevents ALL variable conflicts in complex playbooks.
|
||||
|
||||
## Overview
|
||||
|
||||
This document captures variable management patterns from production-grade Ansible
|
||||
roles, demonstrating how to organize, name, and document variables for clarity
|
||||
and maintainability.
|
||||
|
||||
## Pattern: defaults/ vs vars/ Usage
|
||||
|
||||
### Description
|
||||
|
||||
Use **defaults/** for user-configurable values (low precedence, easily
|
||||
overridden) and **vars/** for internal/OS-specific values (high precedence,
|
||||
should not be overridden).
|
||||
|
||||
### File Paths
|
||||
|
||||
- `defaults/main.yml` - User-facing configuration
|
||||
- `vars/Debian.yml` - Debian-specific internal values (optional)
|
||||
- `vars/RedHat.yml` - RedHat-specific internal values (optional)
|
||||
|
||||
### defaults/main.yml Pattern
|
||||
|
||||
**geerlingguy.security example:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
security_ssh_port: 22
|
||||
security_ssh_password_authentication: "no"
|
||||
security_ssh_permit_root_login: "no"
|
||||
security_ssh_usedns: "no"
|
||||
security_ssh_permit_empty_password: "no"
|
||||
security_ssh_challenge_response_auth: "no"
|
||||
security_ssh_gss_api_authentication: "no"
|
||||
security_ssh_x11_forwarding: "no"
|
||||
security_sshd_state: started
|
||||
security_ssh_restart_handler_state: restarted
|
||||
security_ssh_allowed_users: []
|
||||
security_ssh_allowed_groups: []
|
||||
|
||||
security_sudoers_passwordless: []
|
||||
security_sudoers_passworded: []
|
||||
|
||||
security_autoupdate_enabled: true
|
||||
security_autoupdate_blacklist: []
|
||||
|
||||
security_fail2ban_enabled: true
|
||||
security_fail2ban_custom_configuration_template: "jail.local.j2"
|
||||
```
|
||||
|
||||
**geerlingguy.github-users example:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
github_users: []
|
||||
# You can specify an object with 'name' (required) and 'groups' (optional):
|
||||
# - name: geerlingguy
|
||||
# groups: www-data,sudo
|
||||
|
||||
# Or you can specify a GitHub username directly:
|
||||
# - geerlingguy
|
||||
|
||||
github_users_absent: []
|
||||
# You can specify an object with 'name' (required):
|
||||
# - name: geerlingguy
|
||||
|
||||
# Or you can specify a GitHub username directly:
|
||||
# - geerlingguy
|
||||
|
||||
github_users_authorized_keys_exclusive: true
|
||||
|
||||
github_url: https://github.com
|
||||
```
|
||||
|
||||
**Key Elements:**
|
||||
|
||||
1. **Role prefix** - Every variable starts with role name
|
||||
2. **Feature grouping** - ssh variables together, autoupdate together, etc.
|
||||
3. **Inline comments** - Examples shown as comments
|
||||
4. **Default values** - Sensible defaults that work out-of-box
|
||||
5. **Empty lists** - Default to [] not undefined
|
||||
6. **Quoted strings** - "no", "yes" for SSH config values (prevents YAML boolean interpretation)
|
||||
|
||||
### vars/ OS-Specific Pattern
|
||||
|
||||
**geerlingguy.security vars/Debian.yml:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
security_ssh_config_path: /etc/ssh/sshd_config
|
||||
security_sshd_name: ssh
|
||||
```
|
||||
|
||||
**geerlingguy.security vars/RedHat.yml:**
|
||||
|
||||
```yaml
|
||||
---
|
||||
security_ssh_config_path: /etc/ssh/sshd_config
|
||||
security_sshd_name: sshd
|
||||
```
|
||||
|
||||
**Loading Pattern in tasks/main.yml:**
|
||||
|
||||
```yaml
|
||||
- name: Include OS-specific variables.
|
||||
include_vars: "{{ ansible_os_family }}.yml"
|
||||
```
|
||||
|
||||
### Decision Matrix
|
||||
|
||||
| Variable Type | Location | Precedence | Use Case | Override |
|
||||
|--------------|----------|------------|----------|----------|
|
||||
| User configuration | defaults/ | Low | Settings users customize | Easily overridden in playbook |
|
||||
| OS-specific paths | vars/ | High | File paths, service names | Should not be overridden |
|
||||
| Feature toggles | defaults/ | Low | Enable/disable features | User choice |
|
||||
| Internal constants | vars/ | High | Values role needs to work | Role implementation detail |
|
||||
|
||||
### When to Use
|
||||
|
||||
**defaults/ - Use for:**
|
||||
|
||||
- Port numbers users might change
|
||||
- Feature enable/disable flags
|
||||
- List of items users configure
|
||||
- Behavioral options
|
||||
- Template paths users might override
|
||||
|
||||
**vars/ - Use for:**
|
||||
|
||||
- Service names that differ by OS (ssh vs sshd)
|
||||
- Configuration file paths
|
||||
- Package names that vary by OS
|
||||
- Internal role constants
|
||||
- Values that should rarely/never be overridden
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Don't put user-facing config in vars/ (can't be easily overridden)
|
||||
- ❌ Don't put OS-specific paths in defaults/ (users shouldn't need to change)
|
||||
- ❌ Avoid duplicating values between defaults/ and vars/
|
||||
- ❌ Don't use vars/ for what should be defaults/ (breaks override mechanism)
|
||||
|
||||
## Pattern: Variable Naming Conventions
|
||||
|
||||
### Description
|
||||
|
||||
Use a consistent, hierarchical naming pattern: `{role_name}_{feature}_{attribute}`
|
||||
|
||||
### Naming Pattern Structure
|
||||
|
||||
```text
|
||||
{role_name}_{feature}_{attribute}_{sub_attribute}
|
||||
```
|
||||
|
||||
### Examples from security role
|
||||
|
||||
- `security_ssh_port` - Role: security, Feature: ssh, Attribute: port
|
||||
- `security_ssh_password_authentication` - Role: security, Feature: ssh,
|
||||
Attribute: password_authentication
|
||||
- `security_fail2ban_enabled` - Role: security, Feature: fail2ban,
|
||||
Attribute: enabled
|
||||
- `security_autoupdate_reboot_time` - Role: security, Feature: autoupdate,
|
||||
Attribute: reboot_time
|
||||
- `security_ssh_restart_handler_state` - Role: security, Feature: ssh,
|
||||
Attribute: restart_handler_state
|
||||
|
||||
### Examples from github-users role
|
||||
|
||||
- `github_users` - Role: github-users (shortened to github),
|
||||
Feature: users (implicit)
|
||||
- `github_users_absent` - Role: github, Feature: users,
|
||||
Attribute: absent
|
||||
- `github_users_authorized_keys_exclusive` - Role: github, Feature: users,
|
||||
Attribute: authorized_keys_exclusive
|
||||
- `github_url` - Role: github, Feature: url (API endpoint)
|
||||
|
||||
### Naming Guidelines
|
||||
|
||||
1. **Always use role prefix** - Prevents variable name collisions
|
||||
2. **Use full words** - No abbreviations (password not pwd, configuration not cfg)
|
||||
3. **Snake_case only** - Underscores, never camelCase or kebab-case
|
||||
4. **Feature grouping** - Related vars share feature prefix for logical grouping
|
||||
5. **Hierarchical structure** - General to specific
|
||||
(ssh → password → authentication)
|
||||
6. **Boolean naming** - Use `_enabled`, `_disabled`, or descriptive names
|
||||
(not just `_flag`)
|
||||
7. **Descriptive, not cryptic** - Variable name should explain purpose
|
||||
|
||||
### When to Use
|
||||
|
||||
- All role variables without exception
|
||||
- Internal variables (loop vars, registered results) can skip prefix if scope is
|
||||
limited
|
||||
- Consistently apply pattern across all variables in the role
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Generic names: `port`, `enabled`, `users`
|
||||
(conflicts in complex playbooks)
|
||||
- ❌ Abbreviations: `cfg`, `pwd`, `usr` (harder to read)
|
||||
- ❌ camelCase: `githubUsersAbsent` (not Ansible convention)
|
||||
- ❌ Inconsistent prefixes: Some vars with prefix, some without
|
||||
- ❌ Overly long names:
|
||||
`security_ssh_configuration_password_authentication_setting`
|
||||
(be descriptive, not verbose)
|
||||
|
||||
## Pattern: Boolean vs String Values
|
||||
|
||||
### Description
|
||||
|
||||
Distinguish between Ansible booleans and configuration file string values.
|
||||
Quote strings that look like booleans.
|
||||
|
||||
### Ansible Booleans (unquoted)
|
||||
|
||||
**Use for feature flags, task conditions, role logic:**
|
||||
|
||||
```yaml
|
||||
security_fail2ban_enabled: true
|
||||
security_autoupdate_enabled: true
|
||||
github_users_authorized_keys_exclusive: true
|
||||
```
|
||||
|
||||
**Valid Ansible boolean values:**
|
||||
|
||||
- `true` / `false` (preferred)
|
||||
- `yes` / `no`
|
||||
- `on` / `off`
|
||||
- `1` / `0`
|
||||
|
||||
### Configuration Strings (quoted)
|
||||
|
||||
**Use for values written to config files:**
|
||||
|
||||
```yaml
|
||||
security_ssh_password_authentication: "no"
|
||||
security_ssh_permit_root_login: "no"
|
||||
security_ssh_usedns: "no"
|
||||
security_autoupdate_reboot: "false"
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
When Ansible sees `no` or `false` without quotes, it converts to boolean. When
|
||||
this boolean is then written to a config file (via lineinfile or template), it
|
||||
becomes `False` or `false`, which might not match the config file's expected
|
||||
format (e.g., SSH expects `no`/`yes`).
|
||||
|
||||
### Pattern from security role
|
||||
|
||||
```yaml
|
||||
# Ansible boolean (role logic)
|
||||
# Controls whether to install fail2ban
|
||||
security_fail2ban_enabled: true
|
||||
|
||||
# Config string (written to /etc/ssh/sshd_config)
|
||||
# Literal string "no" for SSH
|
||||
security_ssh_password_authentication: "no"
|
||||
```
|
||||
|
||||
### When to Use
|
||||
|
||||
**Unquoted booleans:**
|
||||
|
||||
- Feature enable/disable flags (`role_feature_enabled`)
|
||||
- Task conditionals (`when:` clauses)
|
||||
- Handler behavior
|
||||
- Internal role logic
|
||||
|
||||
**Quoted strings:**
|
||||
|
||||
- Values written to config files
|
||||
- Values that must preserve exact format
|
||||
- Values that look like booleans but aren't
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Unquoted yes/no for config values (becomes `True`/`False` in file)
|
||||
- ❌ Quoted booleans for feature flags (unnecessarily complex)
|
||||
- ❌ Inconsistent quoting across similar variables
|
||||
|
||||
## Pattern: List and Dictionary Structures
|
||||
|
||||
### Description
|
||||
|
||||
Use flexible data structures that support both simple and complex use cases.
|
||||
|
||||
### Simple List Pattern
|
||||
|
||||
**github-users simple list:**
|
||||
|
||||
```yaml
|
||||
github_users:
|
||||
- geerlingguy
|
||||
- fabpot
|
||||
- johndoe
|
||||
```
|
||||
|
||||
**security simple list:**
|
||||
|
||||
```yaml
|
||||
security_sudoers_passwordless:
|
||||
- deployuser
|
||||
- admin
|
||||
|
||||
security_ssh_allowed_users:
|
||||
- alice
|
||||
- bob
|
||||
```
|
||||
|
||||
### List of Dictionaries Pattern
|
||||
|
||||
**github-users complex pattern:**
|
||||
|
||||
```yaml
|
||||
github_users:
|
||||
- name: geerlingguy
|
||||
groups: www-data,sudo
|
||||
- name: fabpot
|
||||
groups: developers
|
||||
- johndoe # Still supports simple string
|
||||
```
|
||||
|
||||
**Task handling both patterns:**
|
||||
|
||||
```yaml
|
||||
- name: Ensure GitHub user accounts are present.
|
||||
user:
|
||||
# Handles both dict and string
|
||||
name: "{{ item.name | default(item) }}"
|
||||
# Optional attribute
|
||||
groups: "{{ item.groups | default(omit) }}"
|
||||
```
|
||||
|
||||
**Key technique:** `{{ item.name | default(item) }}`
|
||||
|
||||
- If item is a dict with 'name' key → use item.name
|
||||
- If item is a string → default to item itself
|
||||
- Supports both simple and complex usage
|
||||
|
||||
### Dictionary Pattern
|
||||
|
||||
**security dictionary example (inferred, not in role):**
|
||||
|
||||
```yaml
|
||||
security_ssh_config:
|
||||
port: 22
|
||||
password_auth: "no"
|
||||
permit_root: "no"
|
||||
```
|
||||
|
||||
This pattern is less common in geerlingguy roles (flat variables preferred for simplicity).
|
||||
|
||||
### When to Use
|
||||
|
||||
**Simple lists:**
|
||||
|
||||
- When each item needs only one value
|
||||
- User management (simple usernames)
|
||||
- Package lists
|
||||
- Simple configuration items
|
||||
|
||||
**List of dicts:**
|
||||
|
||||
- When items have multiple optional attributes
|
||||
- Users with groups, shells, home directories
|
||||
- Complex configuration items
|
||||
- When backwards compatibility with simple list is needed
|
||||
|
||||
**Flat variables:**
|
||||
|
||||
- When configuration is not deeply nested
|
||||
- When clarity is more important than brevity
|
||||
- When users need to override individual values
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Deep nesting (3+ levels) - Hard to override, hard to document
|
||||
- ❌ Inconsistent structure - Some items as strings, others as dicts without
|
||||
handling
|
||||
- ❌ Required attributes in complex structures without defaults
|
||||
- ❌ Over-engineering simple use cases
|
||||
|
||||
## Pattern: Default Value Strategies
|
||||
|
||||
### Description
|
||||
|
||||
Choose appropriate default values that balance security, usability, and least surprise.
|
||||
|
||||
### Empty List Defaults
|
||||
|
||||
```yaml
|
||||
github_users: []
|
||||
github_users_absent: []
|
||||
security_ssh_allowed_users: []
|
||||
security_sudoers_passwordless: []
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
- Safe default (no users created/removed)
|
||||
- Allows conditional logic: `when: github_users | length > 0`
|
||||
- Users must explicitly configure
|
||||
- No surprising side effects
|
||||
|
||||
### Secure Defaults
|
||||
|
||||
```yaml
|
||||
security_ssh_password_authentication: "no"
|
||||
security_ssh_permit_root_login: "no"
|
||||
github_users_authorized_keys_exclusive: true
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
- Security-first approach
|
||||
- Users can relax security if needed
|
||||
- Prevents accidental insecure configurations
|
||||
|
||||
### Service State Defaults
|
||||
|
||||
```yaml
|
||||
security_sshd_state: started
|
||||
security_ssh_restart_handler_state: restarted
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
- Explicit state management
|
||||
- Allows users to override (e.g., for testing)
|
||||
- Documents expected state
|
||||
|
||||
### Feature Toggles
|
||||
|
||||
```yaml
|
||||
security_fail2ban_enabled: true
|
||||
security_autoupdate_enabled: true
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
- Enable useful features by default
|
||||
- Easy to disable if not wanted
|
||||
- Clear intent
|
||||
|
||||
### Sensible Configuration Defaults
|
||||
|
||||
```yaml
|
||||
security_ssh_port: 22
|
||||
github_url: https://github.com
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
|
||||
- Standard/expected values
|
||||
- Users only change when needed
|
||||
- Reduces configuration burden
|
||||
|
||||
### When to Use
|
||||
|
||||
- **Empty lists** - When no default action is safe
|
||||
- **Secure defaults** - For security-sensitive settings
|
||||
- **Enabled by default** - For beneficial features with no downsides
|
||||
- **Standard values** - For well-known defaults (port 22, standard URLs)
|
||||
|
||||
### Anti-pattern
|
||||
|
||||
- ❌ Undefined defaults - Use `[]` or explicit `null`, not absent
|
||||
- ❌ Insecure defaults - Don't default to `password_authentication: "yes"`
|
||||
- ❌ Surprising defaults - Don't create users/change configs by default
|
||||
- ❌ Missing defaults - Every variable in defaults/main.yml should have a value
|
||||
|
||||
## Comparison to Virgo-Core Roles
|
||||
|
||||
### system_user Role
|
||||
|
||||
**Variable Analysis:**
|
||||
|
||||
```yaml
|
||||
# From system_user/defaults/main.yml
|
||||
system_user_name: ""
|
||||
system_user_groups: []
|
||||
system_user_shell: /bin/bash
|
||||
system_user_ssh_keys: []
|
||||
system_user_sudo_access: "full"
|
||||
system_user_sudo_commands: []
|
||||
system_user_state: present
|
||||
```
|
||||
|
||||
**Matches geerlingguy patterns:**
|
||||
|
||||
- ✅ Role prefix (system_user_*)
|
||||
- ✅ Snake_case naming
|
||||
- ✅ Empty list defaults
|
||||
- ✅ Descriptive names
|
||||
- ✅ All in defaults/main.yml
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ⚠️ No feature grouping (all variables are related to user management,
|
||||
so not needed)
|
||||
- ⚠️ Could use string for sudo_access
|
||||
("full", "commands", "none" vs full/limited)
|
||||
- ✅ No vars/ directory needed (no OS-specific values)
|
||||
|
||||
**Pattern Match:** 95% - Excellent variable management
|
||||
|
||||
### proxmox_access Role
|
||||
|
||||
**Variable Analysis (sample):**
|
||||
|
||||
```yaml
|
||||
# From proxmox_access/defaults/main.yml
|
||||
proxmox_access_roles: []
|
||||
proxmox_access_groups: []
|
||||
proxmox_access_users: []
|
||||
proxmox_access_tokens: []
|
||||
proxmox_access_acls: []
|
||||
proxmox_access_export_terraform_env: false
|
||||
```
|
||||
|
||||
**Matches:**
|
||||
|
||||
- ✅ Role prefix (proxmox_access_*)
|
||||
- ✅ Snake_case naming
|
||||
- ✅ Empty list defaults
|
||||
- ✅ Boolean flag for optional feature
|
||||
- ✅ Feature grouping (access_roles, access_groups, access_users)
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ✅ No OS-specific vars needed (Proxmox-specific role)
|
||||
- ✅ Good variable organization
|
||||
|
||||
**Pattern Match:** 100% - Perfect variable management
|
||||
|
||||
### proxmox_network Role
|
||||
|
||||
**Variable Analysis (sample):**
|
||||
|
||||
```yaml
|
||||
# From proxmox_network/defaults/main.yml
|
||||
proxmox_network_bridges: []
|
||||
proxmox_network_vlans: []
|
||||
proxmox_network_verify_connectivity: true
|
||||
```
|
||||
|
||||
**Matches:**
|
||||
|
||||
- ✅ Role prefix (proxmox_network_*)
|
||||
- ✅ Snake_case naming
|
||||
- ✅ Empty list defaults
|
||||
- ✅ Boolean flag
|
||||
- ✅ Feature grouping
|
||||
|
||||
**Gaps:**
|
||||
|
||||
- ✅ Excellent pattern adherence
|
||||
|
||||
**Pattern Match:** 100% - Perfect variable management
|
||||
|
||||
## Summary
|
||||
|
||||
**Universal Variable Management Patterns:**
|
||||
|
||||
1. Role-prefixed variable names (prevents conflicts)
|
||||
2. Snake_case naming convention
|
||||
3. Feature grouping with shared prefixes
|
||||
4. defaults/ for user configuration (low precedence)
|
||||
5. vars/ for OS-specific values (high precedence)
|
||||
6. Empty lists as safe defaults (`[]`)
|
||||
7. Quoted string booleans for config files (`"no"`, `"yes"`)
|
||||
8. Unquoted Ansible booleans for feature flags
|
||||
9. Flexible list/dict patterns with `item.name | default(item)`
|
||||
10. Descriptive full names, no abbreviations
|
||||
|
||||
**Key Takeaways:**
|
||||
|
||||
- Variable naming is not just convention - it prevents real bugs
|
||||
- defaults/ vs vars/ distinction is critical for override behavior
|
||||
- Quote config file values that look like booleans
|
||||
- Support both simple and complex usage patterns when possible
|
||||
- Default to secure, safe, empty values
|
||||
- Feature grouping makes variable relationships clear
|
||||
|
||||
## Validation: geerlingguy.postgresql
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
|
||||
### Role-Prefixed Variable Names
|
||||
|
||||
- **Pattern: Role prefix on ALL variables** - ✅ **Confirmed**
|
||||
- PostgreSQL: All variables start with `postgresql_`
|
||||
- Examples: postgresql_databases, postgresql_users, postgresql_hba_entries,
|
||||
postgresql_global_config_options
|
||||
- **4/4 roles confirm this is universal**
|
||||
|
||||
### Complex Data Structures
|
||||
|
||||
- **Pattern: List of dicts with comprehensive inline documentation** -
|
||||
✅ **EXCELLENT EXAMPLE**
|
||||
- PostgreSQL has multiple complex list-of-dict variables:
|
||||
|
||||
```yaml
|
||||
postgresql_databases: []
|
||||
# - name: exampledb # required; the rest are optional
|
||||
# lc_collate: # defaults to 'en_US.UTF-8'
|
||||
# lc_ctype: # defaults to 'en_US.UTF-8'
|
||||
# encoding: # defaults to 'UTF-8'
|
||||
# template: # defaults to 'template0'
|
||||
# login_host: # defaults to 'localhost'
|
||||
# login_password: # defaults to not set
|
||||
# login_user: # defaults to 'postgresql_user'
|
||||
# state: # defaults to 'present'
|
||||
|
||||
postgresql_users: []
|
||||
# - name: jdoe #required; the rest are optional
|
||||
# password: # defaults to not set
|
||||
# encrypted: # defaults to not set
|
||||
# role_attr_flags: # defaults to not set
|
||||
# db: # defaults to not set
|
||||
# state: # defaults to 'present'
|
||||
```
|
||||
|
||||
- **Validates:** Complex dict structures work beautifully with inline
|
||||
documentation
|
||||
- **Best practice:** Show ALL possible keys, mark required vs optional,
|
||||
document defaults
|
||||
|
||||
### defaults/ vs vars/ Usage
|
||||
|
||||
- **Pattern: defaults/ for user config, vars/ for OS-specific** -
|
||||
✅ **Confirmed**
|
||||
- defaults/main.yml: 100+ lines of user-configurable variables with extensive
|
||||
inline docs
|
||||
- vars/{Archlinux,Debian,RedHat}.yml: OS-specific package names, paths,
|
||||
service names, versions
|
||||
- **4/4 roles follow this pattern exactly**
|
||||
|
||||
### Empty List Defaults
|
||||
|
||||
- **Pattern: Default to [] for list variables** - ✅ **Confirmed**
|
||||
- postgresql_databases: []
|
||||
- postgresql_users: []
|
||||
- postgresql_privs: []
|
||||
- **4/4 roles use empty list defaults for safety**
|
||||
|
||||
### Feature Grouping
|
||||
|
||||
- **Pattern: Feature-based variable prefixes** - ✅ **Confirmed**
|
||||
- postgresql_global_config_* for server configuration
|
||||
- postgresql_hba_* for host-based authentication
|
||||
- postgresql_unix_socket_* for socket configuration
|
||||
- **Demonstrates:** Feature grouping scales to large variable sets
|
||||
(20+ variables)
|
||||
|
||||
### Variable Documentation Pattern
|
||||
|
||||
- **Pattern: Inline comments in defaults/main.yml** -
|
||||
✅ **BEST PRACTICE EXAMPLE**
|
||||
- Every complex variable has commented examples
|
||||
- Shows required vs optional keys
|
||||
- Documents default values inline
|
||||
- Provides usage context
|
||||
- **This is THE gold standard for complex variable documentation**
|
||||
|
||||
### Advanced Pattern: Flexible Dict Structures
|
||||
|
||||
- **Pattern: Optional attributes with sensible defaults** - ✅ **NEW INSIGHT**
|
||||
- PostgreSQL variables accept dicts with only required keys
|
||||
- Optional keys fall back to role defaults
|
||||
- Task code: `item.login_host | default('localhost')`
|
||||
- **Pattern:** Design dict structures so only required keys are necessary
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What PostgreSQL Role Confirms:**
|
||||
|
||||
1. ✅ Role-prefixed variable names are universal (4/4 roles)
|
||||
2. ✅ Snake_case naming is universal (4/4 roles)
|
||||
3. ✅ Feature grouping is universal (4/4 roles)
|
||||
4. ✅ Empty list defaults are universal (4/4 roles)
|
||||
5. ✅ defaults/ vs vars/ separation is universal (4/4 roles)
|
||||
6. ✅ Inline documentation is critical for complex variables
|
||||
|
||||
**What PostgreSQL Role Demonstrates:**
|
||||
|
||||
1. 🔄 Complex list-of-dict variables can have 10+ optional attributes
|
||||
2. 🔄 Inline documentation prevents user confusion for complex structures
|
||||
3. 🔄 Show ALL possible keys, even optional ones
|
||||
4. 🔄 Mark required vs optional vs defaults in comments
|
||||
5. 🔄 Large variable sets (20+) benefit from logical grouping
|
||||
|
||||
**Pattern Confidence After PostgreSQL Validation (4/4 roles):**
|
||||
|
||||
- **Role prefixes:** UNIVERSAL (4/4 roles use them)
|
||||
- **Snake_case:** UNIVERSAL (4/4 roles use it)
|
||||
- **Feature grouping:** UNIVERSAL (4/4 roles group related variables)
|
||||
- **Empty list defaults:** UNIVERSAL (4/4 roles use [])
|
||||
- **defaults/ vs vars/:** UNIVERSAL (4/4 roles follow pattern)
|
||||
- **Complex dict structures:** VALIDATED (postgresql shows best practices at scale)
|
||||
- **Inline documentation:** CRITICAL (essential for complex variables)
|
||||
|
||||
## Validation: geerlingguy.pip and geerlingguy.git
|
||||
|
||||
**Analysis Date:** 2025-10-23
|
||||
**Repositories:**
|
||||
|
||||
- <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- <https://github.com/geerlingguy/ansible-role-git>
|
||||
|
||||
### Minimal Variables Pattern (pip role)
|
||||
|
||||
- **Pattern: Only essential variables** - ✅ **Confirmed**
|
||||
- pip has only 3 variables: pip_package, pip_executable, pip_install_packages
|
||||
- All variables role-prefixed with pip_
|
||||
- defaults/main.yml is under 10 lines
|
||||
- **Key finding:** Minimal roles maintain same naming discipline
|
||||
|
||||
- **Pattern: String defaults with alternatives** - ✅ **Confirmed**
|
||||
- pip_package: `python3-pip`
|
||||
(shows python-pip alternative in README)
|
||||
- pip_executable: `pip3` (auto-detected, can override)
|
||||
- **6/6 roles document alternatives in README or comments**
|
||||
|
||||
- **Pattern: List variable with dict options** - ✅ **Confirmed**
|
||||
- pip_install_packages: defaults to `[]`
|
||||
- Supports simple strings or dicts with keys: name, version, state, virtualenv,
|
||||
extra_args
|
||||
- **Validates:** List-of-string-or-dict pattern is universal
|
||||
|
||||
### Utility Role Variables Pattern (git role)
|
||||
|
||||
- **Pattern: Feature-toggle booleans** - ✅ **Confirmed**
|
||||
- git_install_from_source: `false` (controls installation method)
|
||||
- git_install_force_update: `false` (controls version management)
|
||||
- **7/7 roles use boolean flags for optional features**
|
||||
|
||||
- **Pattern: Conditional variable groups** - ✅ **Confirmed**
|
||||
- Source install variables: workspace, version, path, force_update
|
||||
- Only relevant when git_install_from_source: true
|
||||
- Grouped together in defaults/main.yml
|
||||
- **Validates:** Conditional features have grouped variables
|
||||
|
||||
- **Pattern: Platform-specific vars/** - ✅ **Confirmed**
|
||||
- git role uses vars/Debian.yml and vars/RedHat.yml
|
||||
(implied from structure)
|
||||
- vars/ contains non-configurable OS-specific data
|
||||
- defaults/ contains all user-configurable options
|
||||
- **7/7 roles use vars/ for OS-specific package lists**
|
||||
|
||||
### Key Validation Findings
|
||||
|
||||
**What pip + git Roles Confirm:**
|
||||
|
||||
1. ✅ Role-prefix naming universal across all role sizes (7/7 roles)
|
||||
2. ✅ Snake_case universal (7/7 roles)
|
||||
3. ✅ Empty list defaults universal (7/7 roles use [])
|
||||
4. ✅ Boolean flags for features universal (7/7 roles)
|
||||
5. ✅ defaults/ vs vars/ separation universal (7/7 roles)
|
||||
6. ✅ Variable grouping applies even to simple roles (7/7 roles)
|
||||
|
||||
**Pattern Confidence After Utility Role Validation (7/7 roles):**
|
||||
|
||||
- **Role prefixes:** UNIVERSAL (7/7 roles use them)
|
||||
- **Snake_case:** UNIVERSAL (7/7 roles use it)
|
||||
- **Feature grouping:** UNIVERSAL (7/7 roles group related variables)
|
||||
- **Empty list defaults:** UNIVERSAL (7/7 roles use [])
|
||||
- **defaults/ vs vars/:** UNIVERSAL (7/7 roles follow pattern)
|
||||
- **Boolean feature toggles:** UNIVERSAL (7/7 roles use them)
|
||||
- **Conditional variable groups:** VALIDATED
|
||||
(git proves pattern for optional features)
|
||||
- **Minimal variables principle:** CONFIRMED
|
||||
(pip shows simplicity is acceptable)
|
||||
|
||||
**Virgo-Core Assessment:**
|
||||
|
||||
All three Virgo-Core roles demonstrate excellent variable management practices.
|
||||
They follow geerlingguy patterns closely and have no critical gaps. Minor
|
||||
enhancements could include more inline documentation in defaults/ files,
|
||||
especially for any complex dict structures.
|
||||
|
||||
**Next Steps:**
|
||||
|
||||
Apply these patterns rigorously in new roles. The variable management discipline
|
||||
in existing roles should be maintained and used as a template. For any future
|
||||
roles with complex variables, follow the postgresql pattern of comprehensive
|
||||
inline documentation.
|
||||
244
skills/ansible-best-practices/reference/production-repos.md
Normal file
244
skills/ansible-best-practices/reference/production-repos.md
Normal file
@@ -0,0 +1,244 @@
|
||||
# Production Repository Reference
|
||||
|
||||
**Research Date:** 2025-10-23
|
||||
|
||||
## Analyzed Repositories
|
||||
|
||||
### Deep Exemplars
|
||||
|
||||
#### 1. geerlingguy/ansible-role-security
|
||||
|
||||
- **Purpose:** System hardening and security baseline configuration
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-security>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/security>
|
||||
- **Key Learnings:**
|
||||
- Molecule testing infrastructure as template for all roles
|
||||
- Multi-distribution CI testing (rockylinux9, ubuntu2404, debian12)
|
||||
- Security-focused variable defaults (ssh hardening, fail2ban, autoupdate)
|
||||
- Comprehensive README with warnings and context
|
||||
- Task file organization (ssh.yml, fail2ban.yml, autoupdate-{OS}.yml)
|
||||
- Configuration validation patterns (sshd -T, visudo -cf)
|
||||
- **Downloads:** 1.5M+ (highly popular role)
|
||||
- **Complexity:** Medium (4 task files, 3 handlers, OS-specific vars)
|
||||
|
||||
#### 2. geerlingguy/ansible-role-github-users
|
||||
|
||||
- **Purpose:** User and SSH key management from GitHub accounts (maps to system_user)
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-github-users>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/github_users>
|
||||
- **Key Learnings:**
|
||||
- Flexible variable patterns: supports both simple strings and complex dicts
|
||||
- item.name | default(item) pattern for backward compatibility
|
||||
- Platform-agnostic role (GenericUNIX, GenericLinux support)
|
||||
- Minimal role structure (no handlers, no vars/, simple tasks)
|
||||
- User management without service restarts
|
||||
- Inline documentation showing both simple and complex usage
|
||||
- **Downloads:** 100K+
|
||||
- **Complexity:** Low (single task file, no handlers, no OS-specific vars)
|
||||
|
||||
### Breadth Validation
|
||||
|
||||
#### 3. geerlingguy/ansible-role-docker
|
||||
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-docker>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/docker>
|
||||
- **Key Learnings:**
|
||||
- Advanced include_vars with first_found lookup for better OS fallback
|
||||
- Conditional handler execution (when: docker_service_manage | bool)
|
||||
- meta: flush_handlers pattern for mid-play handler execution
|
||||
- Check mode support (ignore_errors: "{{ ansible_check_mode }}")
|
||||
- Repository-specific handlers (apt update for package repo changes)
|
||||
- Expanded test matrix (7 distributions for broad compatibility)
|
||||
- **Downloads:** 2M+ (most popular role analyzed)
|
||||
- **Complexity:** Medium (OS-specific setup files, docker-compose feature, user management)
|
||||
|
||||
#### 4. geerlingguy/ansible-role-postgresql
|
||||
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-postgresql>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/postgresql>
|
||||
- **Key Learnings:**
|
||||
- Best-in-class complex variable documentation (list-of-dicts with all keys shown)
|
||||
- Inline comments marking required vs optional vs defaults
|
||||
- import_tasks vs include_tasks distinction (ordered vs conditional)
|
||||
- Extensive platform support with version ranges ("xenial-jammy")
|
||||
- Database role patterns (users, databases, privileges management)
|
||||
- ArchLinux inclusion for bleeding-edge testing
|
||||
- **Downloads:** 500K+
|
||||
- **Complexity:** High (8+ task files, complex variable structures, database-specific patterns)
|
||||
|
||||
#### 5. geerlingguy/ansible-role-nginx
|
||||
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-nginx>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/nginx>
|
||||
- **Key Learnings:**
|
||||
- Jinja2 block inheritance in templates for user extensibility
|
||||
- Template path variables for customization (nginx_conf_template, nginx_vhost_template)
|
||||
- Both reload AND restart handlers (flexibility for web servers)
|
||||
- Conditional reload handler with state check (when: nginx_service_state == "started")
|
||||
- Validation handler pattern (alternative to task-level validation)
|
||||
- Heavy template usage for complex configuration management
|
||||
- **Downloads:** 1M+
|
||||
- **Complexity:** Medium-High (multiple templates, vhost management, upstream configuration)
|
||||
|
||||
#### 6. geerlingguy/ansible-role-pip
|
||||
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-pip>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/pip>
|
||||
- **Key Learnings:**
|
||||
- Minimal role structure scales down appropriately (only essential directories)
|
||||
- Testing patterns maintained even for 3-task roles
|
||||
- Simple list-of-dicts variable pattern (pip_install_packages)
|
||||
- Utility roles often have BROADER platform support than complex roles
|
||||
- Documentation scales with complexity (concise but complete)
|
||||
- Platform-agnostic package management
|
||||
- **Downloads:** 800K+
|
||||
- **Complexity:** Low (3 tasks total, minimal variables, no handlers)
|
||||
|
||||
#### 7. geerlingguy/ansible-role-git
|
||||
|
||||
- **Repository:** <https://github.com/geerlingguy/ansible-role-git>
|
||||
- **Galaxy:** <https://galaxy.ansible.com/geerlingguy/git>
|
||||
- **Key Learnings:**
|
||||
- Multi-scenario testing (package install vs source install)
|
||||
- MOLECULE_PLAYBOOK variable for testing different installation methods
|
||||
- Boolean feature toggles (git_install_from_source)
|
||||
- Conditional variable groups (source install variables)
|
||||
- import_tasks pattern for optional complex functionality
|
||||
- vars/ directory for OS-specific package lists
|
||||
- **Downloads:** 1.2M+
|
||||
- **Complexity:** Low-Medium (simple core, optional source installation complexity)
|
||||
|
||||
## Pattern Extraction Summary
|
||||
|
||||
### Documents Created
|
||||
|
||||
6 pattern documents extracted from 7 role analyses:
|
||||
|
||||
1. **testing-comprehensive.md** - Molecule, CI/CD, test strategies, idempotence verification
|
||||
2. **role-structure-standards.md** - Directory organization, task routing, naming conventions
|
||||
3. **documentation-templates.md** - README structure, variable docs, examples, troubleshooting
|
||||
4. **variable-management-patterns.md** - defaults vs vars, naming, complex structures, inline docs
|
||||
5. **handler-best-practices.md** - Handler naming, reload vs restart, conditional execution
|
||||
6. **meta-dependencies.md** - galaxy_info, platform specification, tags, dependencies
|
||||
|
||||
### Pattern Confidence Statistics
|
||||
|
||||
- **10 Universal Patterns per category** - Confirmed across all 7 roles
|
||||
- **47 Total Universal Patterns** - Patterns present in 100% of applicable roles
|
||||
- **23 Contextual Patterns** - Patterns that vary appropriately by role complexity or purpose
|
||||
- **14 Evolving Patterns** - Improvements in newer roles or advanced techniques
|
||||
|
||||
### Key Insights
|
||||
|
||||
**Universal Patterns (All 7 roles follow):**
|
||||
|
||||
- Molecule + Docker testing infrastructure (even for minimal 3-task roles)
|
||||
- Role-prefixed variable naming preventing conflicts
|
||||
- GitHub Actions CI with separate lint and molecule jobs
|
||||
- Comprehensive galaxy_info in meta/main.yml
|
||||
- README structure: Title → Requirements → Variables → Example → License
|
||||
- defaults/ for user config, vars/ for OS-specific values
|
||||
- Idempotence testing as primary quality verification
|
||||
|
||||
**Contextual Patterns (Scale appropriately):**
|
||||
|
||||
- Test distribution coverage: 3 for simple roles, 6-7 for complex roles
|
||||
- Task file count: 1 for minimal roles, 8+ for database/complex roles
|
||||
- Variable count: 3-5 for utilities, 20+ for configuration management
|
||||
- Handler presence: service roles have them, utility roles don't
|
||||
- Platform breadth: utilities support more platforms than complex roles
|
||||
|
||||
**Evolving Patterns (Improvements noted):**
|
||||
|
||||
- Advanced include_vars with first_found lookup (better OS fallback)
|
||||
- Jinja2 block inheritance in templates (user extensibility)
|
||||
- Conditional handler execution (docker, nginx patterns)
|
||||
- Complex variable inline documentation (postgresql best practice)
|
||||
- meta: flush_handlers for mid-play execution (docker pattern)
|
||||
|
||||
## Download and Popularity Analysis
|
||||
|
||||
**Most Downloaded Roles:**
|
||||
|
||||
1. docker: 2M+ downloads
|
||||
2. nginx: 1M+ downloads
|
||||
3. security: 1.5M+ downloads
|
||||
4. git: 1.2M+ downloads
|
||||
5. pip: 800K+
|
||||
6. postgresql: 500K+
|
||||
7. github-users: 100K+
|
||||
|
||||
**Insights:**
|
||||
|
||||
- Infrastructure roles (docker, nginx, git, pip) have highest downloads
|
||||
- Security and database roles have strong sustained usage
|
||||
- Niche roles (github-users) still provide valuable patterns despite lower downloads
|
||||
- All roles maintained to same quality standard regardless of popularity
|
||||
|
||||
## Role Complexity Spectrum
|
||||
|
||||
**Minimal (3-5 tasks):**
|
||||
|
||||
- pip: Package installation only
|
||||
- Simple, focused purpose
|
||||
- Broad platform support
|
||||
|
||||
**Low (5-10 tasks):**
|
||||
|
||||
- git: Dual installation methods
|
||||
- github-users: User management
|
||||
- Focused feature set
|
||||
|
||||
**Medium (10-20 tasks):**
|
||||
|
||||
- security: Multiple security features
|
||||
- docker: Service + user management
|
||||
- nginx: Web server + vhost management
|
||||
|
||||
**High (20+ tasks):**
|
||||
|
||||
- postgresql: Database + users + configuration
|
||||
- Complex orchestration
|
||||
- Extensive variable structures
|
||||
|
||||
## Next Research Targets
|
||||
|
||||
### Planned (Complex Orchestration)
|
||||
|
||||
- **geerlingguy/ansible-role-kubernetes** - Multi-node cluster patterns, complex dependencies
|
||||
- **geerlingguy/ansible-role-mysql** - Alternative database patterns, replication, service coordination
|
||||
|
||||
### Future Considerations
|
||||
|
||||
- **Debops roles** - Variable organization at scale, comprehensive ecosystem patterns
|
||||
- **Kubespray** - Multi-node Kubernetes coordination, advanced templating
|
||||
- **OpenStack-Ansible** - HA patterns, service discovery, complex networking
|
||||
|
||||
## Research Application
|
||||
|
||||
### Virgo-Core Roles Validated Against Patterns
|
||||
|
||||
All three Phase 1-3 roles compared against extracted patterns:
|
||||
|
||||
- **system_user** - Excellent alignment with variable management and structure patterns
|
||||
- **proxmox_access** - Strong match with role organization and handler best practices
|
||||
- **proxmox_network** - Good network-specific handler usage, proper verification patterns
|
||||
|
||||
**Primary Gaps Identified:**
|
||||
|
||||
- Testing infrastructure (molecule + CI) missing from all roles (Critical)
|
||||
- galaxy_info could be enhanced with broader platform testing (Important)
|
||||
- README troubleshooting sections would add value (Nice-to-have)
|
||||
|
||||
**Pattern Match Score:**
|
||||
|
||||
- Structure: 95%+ across all three roles
|
||||
- Variable Management: 100% (perfect adherence to patterns)
|
||||
- Documentation: 90% (good foundation, room for enhancement)
|
||||
- Testing: 0% (not yet implemented, highest priority gap)
|
||||
|
||||
## Conclusion
|
||||
|
||||
Analysis of 7 production geerlingguy roles validated comprehensive, battle-tested patterns for Ansible role development. These patterns demonstrate remarkable consistency (47 universal patterns across 100% of roles) while allowing appropriate contextual variation (23 patterns that scale with complexity).
|
||||
|
||||
The research provides high-confidence guidance for Phase 4+ development and establishes testing infrastructure as the primary gap to address in existing roles.
|
||||
338
skills/ansible-best-practices/tools/check_idempotency.py
Executable file
338
skills/ansible-best-practices/tools/check_idempotency.py
Executable file
@@ -0,0 +1,338 @@
|
||||
#!/usr/bin/env -S uv run --script --quiet
|
||||
# /// script
|
||||
# dependencies = ["pyyaml"]
|
||||
# ///
|
||||
"""
|
||||
Check Ansible playbooks for common idempotency issues.
|
||||
|
||||
Detects:
|
||||
- Command/shell tasks without changed_when
|
||||
- Shell tasks without set -euo pipefail
|
||||
- Tasks without no_log that may contain secrets
|
||||
- Tasks missing name attribute
|
||||
- Use of deprecated short module names
|
||||
|
||||
Usage:
|
||||
./check_idempotency.py playbook.yml
|
||||
./check_idempotency.py playbooks/*.yml
|
||||
./check_idempotency.py --strict playbook.yml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("❌ PyYAML required: uv run check_idempotency.py", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class IdempotencyChecker:
|
||||
"""Check Ansible playbooks for idempotency issues."""
|
||||
|
||||
# Modules that should have changed_when
|
||||
COMMAND_MODULES = ['command', 'shell', 'ansible.builtin.command', 'ansible.builtin.shell']
|
||||
|
||||
# Modules that handle secrets
|
||||
SECRET_MODULES = [
|
||||
'user', 'ansible.builtin.user',
|
||||
'mysql_user', 'community.mysql.mysql_user',
|
||||
'postgresql_user', 'community.postgresql.postgresql_user',
|
||||
]
|
||||
|
||||
# Keywords that suggest secrets
|
||||
SECRET_KEYWORDS = ['password', 'token', 'secret', 'key', 'credential', 'api_key']
|
||||
|
||||
def __init__(self, strict: bool = False):
|
||||
self.strict = strict
|
||||
self.issues = []
|
||||
|
||||
def check_playbook(self, playbook_path: Path) -> List[dict]:
|
||||
"""Check a playbook file for issues."""
|
||||
self.issues = []
|
||||
|
||||
try:
|
||||
with open(playbook_path, 'r') as f:
|
||||
content = yaml.safe_load(f)
|
||||
except yaml.YAMLError as e:
|
||||
return [{'severity': 'error', 'message': f"Failed to parse YAML: {e}"}]
|
||||
except IOError as e:
|
||||
return [{'severity': 'error', 'message': f"Failed to read file: {e}"}]
|
||||
|
||||
if not content:
|
||||
return []
|
||||
|
||||
# Check each play
|
||||
for play_idx, play in enumerate(content):
|
||||
if not isinstance(play, dict):
|
||||
continue
|
||||
|
||||
# Check tasks
|
||||
tasks = play.get('tasks', [])
|
||||
self._check_tasks(tasks, f"play[{play_idx}].tasks")
|
||||
|
||||
# Check handlers
|
||||
handlers = play.get('handlers', [])
|
||||
self._check_tasks(handlers, f"play[{play_idx}].handlers")
|
||||
|
||||
# Check pre_tasks
|
||||
pre_tasks = play.get('pre_tasks', [])
|
||||
self._check_tasks(pre_tasks, f"play[{play_idx}].pre_tasks")
|
||||
|
||||
# Check post_tasks
|
||||
post_tasks = play.get('post_tasks', [])
|
||||
self._check_tasks(post_tasks, f"play[{play_idx}].post_tasks")
|
||||
|
||||
return self.issues
|
||||
|
||||
def _check_tasks(self, tasks: list, location: str):
|
||||
"""Check a list of tasks."""
|
||||
for task_idx, task in enumerate(tasks):
|
||||
if not isinstance(task, dict):
|
||||
continue
|
||||
|
||||
task_location = f"{location}[{task_idx}]"
|
||||
|
||||
# Check for name
|
||||
self._check_task_name(task, task_location)
|
||||
|
||||
# Check for command/shell issues
|
||||
self._check_command_shell(task, task_location)
|
||||
|
||||
# Check for secret handling
|
||||
self._check_secrets(task, task_location)
|
||||
|
||||
# Check for deprecated short names
|
||||
self._check_module_names(task, task_location)
|
||||
|
||||
# Recursively check blocks
|
||||
if 'block' in task:
|
||||
self._check_tasks(task['block'], f"{task_location}.block")
|
||||
if 'rescue' in task:
|
||||
self._check_tasks(task['rescue'], f"{task_location}.rescue")
|
||||
if 'always' in task:
|
||||
self._check_tasks(task['always'], f"{task_location}.always")
|
||||
|
||||
def _check_task_name(self, task: dict, location: str):
|
||||
"""Check if task has a name."""
|
||||
if 'name' not in task and 'include_tasks' not in task and 'import_tasks' not in task:
|
||||
self.issues.append({
|
||||
'severity': 'warning',
|
||||
'location': location,
|
||||
'message': 'Task missing name attribute',
|
||||
'suggestion': 'Add name: field to describe what this task does'
|
||||
})
|
||||
|
||||
def _check_command_shell(self, task: dict, location: str):
|
||||
"""Check command/shell tasks for idempotency."""
|
||||
# Find module name
|
||||
module_name = None
|
||||
module_args = None
|
||||
|
||||
for key in task:
|
||||
if key in self.COMMAND_MODULES:
|
||||
module_name = key
|
||||
module_args = task[key]
|
||||
break
|
||||
|
||||
if not module_name:
|
||||
return
|
||||
|
||||
task_name = task.get('name', 'unnamed task')
|
||||
|
||||
# Check for changed_when
|
||||
if 'changed_when' not in task:
|
||||
# Allow exception for tasks with register but no changed_when if they're checks
|
||||
if 'register' in task:
|
||||
# If task name suggests it's a check, this might be intentional
|
||||
if any(word in task_name.lower() for word in ['check', 'verify', 'test', 'get', 'find']):
|
||||
severity = 'info' if self.strict else None
|
||||
if severity:
|
||||
self.issues.append({
|
||||
'severity': severity,
|
||||
'location': location,
|
||||
'message': 'Command/shell task without changed_when',
|
||||
'suggestion': 'Add changed_when: false if this is a read-only check'
|
||||
})
|
||||
else:
|
||||
self.issues.append({
|
||||
'severity': 'warning',
|
||||
'location': location,
|
||||
'message': 'Command/shell task without changed_when',
|
||||
'suggestion': 'Add changed_when: to control when task reports as changed'
|
||||
})
|
||||
else:
|
||||
self.issues.append({
|
||||
'severity': 'warning',
|
||||
'location': location,
|
||||
'message': 'Command/shell task without changed_when or register',
|
||||
'suggestion': 'Add changed_when: and register: for proper idempotency'
|
||||
})
|
||||
|
||||
# Check shell tasks for set -euo pipefail
|
||||
if 'shell' in module_name and isinstance(module_args, str):
|
||||
if '|' in module_args or '>' in module_args: # Has pipes or redirects
|
||||
if 'set -euo pipefail' not in module_args and 'set -o pipefail' not in module_args:
|
||||
self.issues.append({
|
||||
'severity': 'warning',
|
||||
'location': location,
|
||||
'message': 'Shell task with pipes missing "set -euo pipefail"',
|
||||
'suggestion': 'Add "set -euo pipefail" at the start of shell script'
|
||||
})
|
||||
|
||||
# Check if command could be shell (uses pipes, redirects, etc.)
|
||||
if 'command' in module_name and isinstance(module_args, str):
|
||||
if any(char in module_args for char in ['|', '>', '<', '&', ';', '$']):
|
||||
self.issues.append({
|
||||
'severity': 'info',
|
||||
'location': location,
|
||||
'message': 'Command module used with shell features',
|
||||
'suggestion': 'Consider using shell module instead (requires pipes, redirects, etc.)'
|
||||
})
|
||||
|
||||
def _check_secrets(self, task: dict, location: str):
|
||||
"""Check if secrets are handled properly."""
|
||||
# Check module type
|
||||
module_name = None
|
||||
for key in task:
|
||||
if key in self.SECRET_MODULES:
|
||||
module_name = key
|
||||
break
|
||||
|
||||
# Check for secret keywords in task
|
||||
task_str = str(task).lower()
|
||||
has_secret_keyword = any(keyword in task_str for keyword in self.SECRET_KEYWORDS)
|
||||
|
||||
# Check module args for password/secret fields
|
||||
has_secret_arg = False
|
||||
for key, value in task.items():
|
||||
if isinstance(value, dict):
|
||||
for arg_key in value:
|
||||
if any(keyword in arg_key.lower() for keyword in self.SECRET_KEYWORDS):
|
||||
has_secret_arg = True
|
||||
break
|
||||
|
||||
if (module_name or has_secret_keyword or has_secret_arg) and 'no_log' not in task:
|
||||
self.issues.append({
|
||||
'severity': 'warning',
|
||||
'location': location,
|
||||
'message': 'Task may handle secrets without no_log: true',
|
||||
'suggestion': 'Add no_log: true to prevent secrets from appearing in logs'
|
||||
})
|
||||
|
||||
def _check_module_names(self, task: dict, location: str):
|
||||
"""Check for deprecated short module names."""
|
||||
# Common short names that should be fully qualified
|
||||
short_names = {
|
||||
'copy': 'ansible.builtin.copy',
|
||||
'file': 'ansible.builtin.file',
|
||||
'template': 'ansible.builtin.template',
|
||||
'command': 'ansible.builtin.command',
|
||||
'shell': 'ansible.builtin.shell',
|
||||
'apt': 'ansible.builtin.apt',
|
||||
'yum': 'ansible.builtin.yum',
|
||||
'service': 'ansible.builtin.service',
|
||||
'systemd': 'ansible.builtin.systemd',
|
||||
'user': 'ansible.builtin.user',
|
||||
'group': 'ansible.builtin.group',
|
||||
'debug': 'ansible.builtin.debug',
|
||||
'fail': 'ansible.builtin.fail',
|
||||
'assert': 'ansible.builtin.assert',
|
||||
'set_fact': 'ansible.builtin.set_fact',
|
||||
}
|
||||
|
||||
for short_name, fqcn in short_names.items():
|
||||
if short_name in task and '.' not in short_name:
|
||||
self.issues.append({
|
||||
'severity': 'info' if not self.strict else 'warning',
|
||||
'location': location,
|
||||
'message': f'Using deprecated short module name: {short_name}',
|
||||
'suggestion': f'Use FQCN: {fqcn}'
|
||||
})
|
||||
|
||||
|
||||
def print_issues(playbook_path: Path, issues: List[dict]):
|
||||
"""Print issues in a readable format."""
|
||||
if not issues:
|
||||
print(f"✓ {playbook_path}: No issues found")
|
||||
return
|
||||
|
||||
print(f"\n📄 {playbook_path}")
|
||||
print("=" * 70)
|
||||
|
||||
# Group by severity
|
||||
errors = [i for i in issues if i.get('severity') == 'error']
|
||||
warnings = [i for i in issues if i.get('severity') == 'warning']
|
||||
info = [i for i in issues if i.get('severity') == 'info']
|
||||
|
||||
for severity, items, icon in [('ERROR', errors, '❌'), ('WARNING', warnings, '⚠️'), ('INFO', info, 'ℹ️')]:
|
||||
if not items:
|
||||
continue
|
||||
|
||||
print(f"\n{icon} {severity} ({len(items)}):")
|
||||
for issue in items:
|
||||
print(f" Location: {issue.get('location', 'unknown')}")
|
||||
print(f" Issue: {issue.get('message')}")
|
||||
if 'suggestion' in issue:
|
||||
print(f" Suggestion: {issue.get('suggestion')}")
|
||||
print()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Check Ansible playbooks for common idempotency issues"
|
||||
)
|
||||
parser.add_argument(
|
||||
"playbooks",
|
||||
nargs="+",
|
||||
type=Path,
|
||||
help="Playbook files to check"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--strict",
|
||||
action="store_true",
|
||||
help="Treat informational issues as warnings"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--summary",
|
||||
action="store_true",
|
||||
help="Show only summary, not individual issues"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
checker = IdempotencyChecker(strict=args.strict)
|
||||
all_issues = {}
|
||||
total_issues = 0
|
||||
|
||||
for playbook_path in args.playbooks:
|
||||
if not playbook_path.exists():
|
||||
print(f"❌ File not found: {playbook_path}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
issues = checker.check_playbook(playbook_path)
|
||||
all_issues[playbook_path] = issues
|
||||
total_issues += len(issues)
|
||||
|
||||
if not args.summary:
|
||||
print_issues(playbook_path, issues)
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 70)
|
||||
print(f"📊 Summary: Checked {len(args.playbooks)} playbook(s)")
|
||||
print(f" Total issues: {total_issues}")
|
||||
|
||||
if total_issues == 0:
|
||||
print(" ✓ All playbooks look good!")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f" ⚠️ Found issues in {sum(1 for i in all_issues.values() if i)} playbook(s)")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
103
skills/ansible-best-practices/tools/lint-all.sh
Executable file
103
skills/ansible-best-practices/tools/lint-all.sh
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
|
||||
# Run all Ansible linters with proper configuration
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Counters
|
||||
TOTAL_CHECKS=0
|
||||
FAILED_CHECKS=0
|
||||
|
||||
# Function to print section header
|
||||
print_header() {
|
||||
echo ""
|
||||
echo "========================================="
|
||||
echo "$1"
|
||||
echo "========================================="
|
||||
}
|
||||
|
||||
# Function to run a check
|
||||
run_check() {
|
||||
local name="$1"
|
||||
local command="$2"
|
||||
|
||||
TOTAL_CHECKS=$((TOTAL_CHECKS + 1))
|
||||
|
||||
echo -n "Running $name... "
|
||||
|
||||
if eval "$command" > /tmp/lint-output.txt 2>&1; then
|
||||
echo -e "${GREEN}✓ PASS${NC}"
|
||||
return 0
|
||||
else
|
||||
echo -e "${RED}✗ FAIL${NC}"
|
||||
cat /tmp/lint-output.txt
|
||||
FAILED_CHECKS=$((FAILED_CHECKS + 1))
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Change to ansible directory if not already there
|
||||
if [[ ! -d "playbooks" ]] && [[ -d "ansible" ]]; then
|
||||
cd ansible
|
||||
fi
|
||||
|
||||
print_header "Ansible Playbook Linting"
|
||||
|
||||
# Check if ansible-lint is available
|
||||
if command -v ansible-lint &> /dev/null; then
|
||||
run_check "ansible-lint (playbooks)" "ansible-lint playbooks/"
|
||||
run_check "ansible-lint (roles)" "ansible-lint roles/ || true" # May not have roles
|
||||
else
|
||||
echo -e "${YELLOW}⚠ ansible-lint not found, skipping${NC}"
|
||||
fi
|
||||
|
||||
# Check YAML syntax
|
||||
print_header "YAML Syntax Validation"
|
||||
|
||||
if command -v yamllint &> /dev/null; then
|
||||
run_check "yamllint (playbooks)" "yamllint playbooks/"
|
||||
run_check "yamllint (group_vars)" "yamllint group_vars/ || true"
|
||||
run_check "yamllint (host_vars)" "yamllint host_vars/ || true"
|
||||
else
|
||||
echo -e "${YELLOW}⚠ yamllint not found, skipping${NC}"
|
||||
fi
|
||||
|
||||
# Check playbook syntax
|
||||
print_header "Ansible Syntax Check"
|
||||
|
||||
for playbook in playbooks/*.yml; do
|
||||
if [[ -f "$playbook" ]]; then
|
||||
playbook_name=$(basename "$playbook")
|
||||
run_check "syntax ($playbook_name)" "ansible-playbook $playbook --syntax-check"
|
||||
fi
|
||||
done
|
||||
|
||||
# Custom idempotency check (if tool exists)
|
||||
print_header "Idempotency Check"
|
||||
|
||||
IDEMPOTENCY_TOOL="../.claude/skills/ansible-best-practices/tools/check_idempotency.py"
|
||||
if [[ -f "$IDEMPOTENCY_TOOL" ]]; then
|
||||
run_check "idempotency check" "uv run $IDEMPOTENCY_TOOL playbooks/*.yml"
|
||||
else
|
||||
echo -e "${YELLOW}⚠ Idempotency checker not found, skipping${NC}"
|
||||
fi
|
||||
|
||||
# Summary
|
||||
print_header "Summary"
|
||||
|
||||
echo "Total checks: $TOTAL_CHECKS"
|
||||
echo "Passed: $((TOTAL_CHECKS - FAILED_CHECKS))"
|
||||
echo "Failed: $FAILED_CHECKS"
|
||||
|
||||
if [[ $FAILED_CHECKS -eq 0 ]]; then
|
||||
echo -e "${GREEN}✓ All checks passed!${NC}"
|
||||
exit 0
|
||||
else
|
||||
echo -e "${RED}✗ Some checks failed${NC}"
|
||||
exit 1
|
||||
fi
|
||||
Reference in New Issue
Block a user