Files
gh-dieshen-claude-marketpla…/commands/devops-patterns.md
2025-11-29 18:21:22 +08:00

1013 lines
20 KiB
Markdown

# DevOps & Infrastructure as Code Patterns
Comprehensive DevOps practices and Infrastructure as Code patterns using Terraform, Docker, Kubernetes, and CI/CD tools.
## Infrastructure as Code with Terraform
### Project Structure
```
terraform/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── terraform.tfvars
│ ├── staging/
│ └── production/
├── modules/
│ ├── vpc/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── outputs.tf
│ ├── eks/
│ └── rds/
└── global/
├── s3/
└── iam/
```
### Terraform Best Practices
#### Module Design
```hcl
# modules/vpc/main.tf
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = merge(
var.common_tags,
{
Name = "${var.environment}-vpc"
}
)
}
resource "aws_subnet" "public" {
count = length(var.public_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.public_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
map_public_ip_on_launch = true
tags = merge(
var.common_tags,
{
Name = "${var.environment}-public-${count.index + 1}"
Type = "public"
}
)
}
resource "aws_subnet" "private" {
count = length(var.private_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.private_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
tags = merge(
var.common_tags,
{
Name = "${var.environment}-private-${count.index + 1}"
Type = "private"
}
)
}
resource "aws_internet_gateway" "main" {
vpc_id = aws_vpc.main.id
tags = merge(
var.common_tags,
{
Name = "${var.environment}-igw"
}
)
}
resource "aws_nat_gateway" "main" {
count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0
allocation_id = aws_eip.nat[count.index].id
subnet_id = aws_subnet.public[count.index].id
tags = merge(
var.common_tags,
{
Name = "${var.environment}-nat-${count.index + 1}"
}
)
depends_on = [aws_internet_gateway.main]
}
resource "aws_eip" "nat" {
count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0
domain = "vpc"
tags = merge(
var.common_tags,
{
Name = "${var.environment}-nat-eip-${count.index + 1}"
}
)
}
# modules/vpc/variables.tf
variable "environment" {
description = "Environment name"
type = string
}
variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
}
variable "public_subnet_cidrs" {
description = "CIDR blocks for public subnets"
type = list(string)
}
variable "private_subnet_cidrs" {
description = "CIDR blocks for private subnets"
type = list(string)
}
variable "availability_zones" {
description = "Availability zones"
type = list(string)
}
variable "enable_nat_gateway" {
description = "Enable NAT gateway"
type = bool
default = true
}
variable "common_tags" {
description = "Common tags for all resources"
type = map(string)
default = {}
}
# modules/vpc/outputs.tf
output "vpc_id" {
description = "VPC ID"
value = aws_vpc.main.id
}
output "public_subnet_ids" {
description = "Public subnet IDs"
value = aws_subnet.public[*].id
}
output "private_subnet_ids" {
description = "Private subnet IDs"
value = aws_subnet.private[*].id
}
output "nat_gateway_ips" {
description = "NAT Gateway public IPs"
value = aws_eip.nat[*].public_ip
}
```
#### Environment Configuration
```hcl
# environments/production/main.tf
terraform {
backend "s3" {
bucket = "myapp-terraform-state"
key = "production/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-state-lock"
}
}
provider "aws" {
region = var.aws_region
default_tags {
tags = {
Environment = "production"
ManagedBy = "Terraform"
Project = "MyApp"
}
}
}
module "vpc" {
source = "../../modules/vpc"
environment = "production"
vpc_cidr = "10.0.0.0/16"
public_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
private_subnet_cidrs = ["10.0.11.0/24", "10.0.12.0/24", "10.0.13.0/24"]
availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
enable_nat_gateway = true
common_tags = {
CostCenter = "Engineering"
}
}
module "eks" {
source = "../../modules/eks"
cluster_name = "production-eks"
cluster_version = "1.28"
vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnet_ids
node_groups = {
general = {
desired_size = 3
min_size = 3
max_size = 10
instance_types = ["t3.large"]
}
compute = {
desired_size = 2
min_size = 2
max_size = 20
instance_types = ["c5.2xlarge"]
}
}
}
# environments/production/terraform.tfvars
aws_region = "us-east-1"
```
#### State Management
```hcl
# Setup S3 backend with state locking
resource "aws_s3_bucket" "terraform_state" {
bucket = "myapp-terraform-state"
lifecycle {
prevent_destroy = true
}
tags = {
Name = "Terraform State"
}
}
resource "aws_s3_bucket_versioning" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
resource "aws_dynamodb_table" "terraform_locks" {
name = "terraform-state-lock"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"
attribute {
name = "LockID"
type = "S"
}
tags = {
Name = "Terraform State Lock"
}
}
```
## Docker Patterns
### Multi-Stage Builds
#### Node.js Application
```dockerfile
# Base stage for dependencies
FROM node:20-alpine AS base
WORKDIR /app
COPY package*.json ./
# Development stage
FROM base AS development
RUN npm install
COPY . .
CMD ["npm", "run", "dev"]
# Build stage
FROM base AS build
RUN npm ci --only=production
COPY . .
RUN npm run build
# Production stage
FROM node:20-alpine AS production
WORKDIR /app
# Create non-root user
RUN addgroup -g 1001 -S nodejs && \
adduser -S nodejs -u 1001
# Copy only necessary files
COPY --from=build --chown=nodejs:nodejs /app/node_modules ./node_modules
COPY --from=build --chown=nodejs:nodejs /app/dist ./dist
COPY --chown=nodejs:nodejs package*.json ./
# Switch to non-root user
USER nodejs
EXPOSE 3000
CMD ["node", "dist/index.js"]
```
#### Go Application
```dockerfile
# Build stage
FROM golang:1.21-alpine AS builder
WORKDIR /build
# Cache dependencies
COPY go.mod go.sum ./
RUN go mod download
# Copy source code
COPY . .
# Build with optimizations
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
-ldflags='-w -s -extldflags "-static"' \
-a -installsuffix cgo \
-o app ./cmd/server
# Production stage
FROM scratch
# Copy CA certificates for HTTPS
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
# Copy binary
COPY --from=builder /build/app /app
# Non-root user
USER 1000:1000
EXPOSE 8080
ENTRYPOINT ["/app"]
```
#### Python Application
```dockerfile
# Build stage
FROM python:3.11-slim AS builder
WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt
# Production stage
FROM python:3.11-slim
WORKDIR /app
# Copy dependencies from builder
COPY --from=builder /root/.local /root/.local
# Copy application code
COPY . .
# Create non-root user
RUN useradd -m -u 1000 appuser && \
chown -R appuser:appuser /app
USER appuser
# Update PATH
ENV PATH=/root/.local/bin:$PATH
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```
### Docker Compose for Development
```yaml
# docker-compose.yml
version: '3.8'
services:
app:
build:
context: .
dockerfile: Dockerfile
target: development
ports:
- "3000:3000"
volumes:
- .:/app
- /app/node_modules
environment:
- NODE_ENV=development
- DATABASE_URL=postgresql://postgres:password@db:5432/myapp
- REDIS_URL=redis://redis:6379
depends_on:
db:
condition: service_healthy
redis:
condition: service_started
networks:
- app-network
db:
image: postgres:16-alpine
ports:
- "5432:5432"
environment:
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=password
- POSTGRES_DB=myapp
volumes:
- postgres_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 5s
timeout: 5s
retries: 5
networks:
- app-network
redis:
image: redis:7-alpine
ports:
- "6379:6379"
volumes:
- redis_data:/data
networks:
- app-network
nginx:
image: nginx:alpine
ports:
- "80:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
depends_on:
- app
networks:
- app-network
volumes:
postgres_data:
redis_data:
networks:
app-network:
driver: bridge
```
## Kubernetes Patterns
### Deployment with Best Practices
```yaml
# deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: myapp
namespace: production
labels:
app: myapp
version: v1.0.0
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: myapp
template:
metadata:
labels:
app: myapp
version: v1.0.0
spec:
serviceAccountName: myapp-sa
# Security context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
# Init containers
initContainers:
- name: migrate
image: myapp:v1.0.0
command: ['npm', 'run', 'migrate']
envFrom:
- secretRef:
name: myapp-secrets
containers:
- name: app
image: myapp:v1.0.0
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 3000
protocol: TCP
# Environment variables
env:
- name: NODE_ENV
value: "production"
- name: PORT
value: "3000"
envFrom:
- configMapRef:
name: myapp-config
- secretRef:
name: myapp-secrets
# Resource limits
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
# Liveness probe
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
# Readiness probe
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 2
# Security context
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
# Volume mounts
volumeMounts:
- name: tmp
mountPath: /tmp
- name: cache
mountPath: /app/.cache
volumes:
- name: tmp
emptyDir: {}
- name: cache
emptyDir: {}
# Pod disruption budget
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
app: myapp
topologyKey: kubernetes.io/hostname
---
# service.yaml
apiVersion: v1
kind: Service
metadata:
name: myapp
namespace: production
spec:
type: ClusterIP
selector:
app: myapp
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
---
# hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: myapp
namespace: production
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: myapp
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 50
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 0
policies:
- type: Percent
value: 100
periodSeconds: 30
- type: Pods
value: 2
periodSeconds: 30
selectPolicy: Max
---
# pdb.yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: myapp
namespace: production
spec:
minAvailable: 2
selector:
matchLabels:
app: myapp
---
# configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: myapp-config
namespace: production
data:
LOG_LEVEL: "info"
API_TIMEOUT: "30000"
MAX_CONNECTIONS: "100"
---
# secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: myapp-secrets
namespace: production
type: Opaque
stringData:
DATABASE_URL: "postgresql://user:password@db:5432/myapp"
REDIS_URL: "redis://redis:6379"
JWT_SECRET: "your-jwt-secret-here"
```
### Ingress with TLS
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: myapp
namespace: production
annotations:
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/rate-limit: "100"
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
spec:
ingressClassName: nginx
tls:
- hosts:
- myapp.example.com
secretName: myapp-tls
rules:
- host: myapp.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: myapp
port:
number: 80
```
## CI/CD with GitHub Actions
### Complete Pipeline
```yaml
# .github/workflows/ci-cd.yml
name: CI/CD Pipeline
on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
test:
runs-on: ubuntu-latest
services:
postgres:
image: postgres:16
env:
POSTGRES_PASSWORD: postgres
POSTGRES_DB: test_db
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 5432:5432
redis:
image: redis:7
options: >-
--health-cmd "redis-cli ping"
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 6379:6379
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Run linter
run: npm run lint
- name: Run type check
run: npm run type-check
- name: Run unit tests
run: npm run test:unit
env:
NODE_ENV: test
- name: Run integration tests
run: npm run test:integration
env:
NODE_ENV: test
DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db
REDIS_URL: redis://localhost:6379
- name: Generate coverage report
run: npm run test:coverage
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
files: ./coverage/lcov.info
fail_ci_if_error: true
build:
needs: test
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=sha,prefix={{branch}}-
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
platforms: linux/amd64,linux/arm64
deploy-staging:
needs: build
if: github.ref == 'refs/heads/develop'
runs-on: ubuntu-latest
environment:
name: staging
url: https://staging.example.com
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Configure kubectl
uses: azure/k8s-set-context@v3
with:
method: kubeconfig
kubeconfig: ${{ secrets.KUBE_CONFIG_STAGING }}
- name: Deploy to Kubernetes
run: |
kubectl set image deployment/myapp \
app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:develop \
-n staging
kubectl rollout status deployment/myapp -n staging --timeout=5m
- name: Run smoke tests
run: |
npm run test:e2e -- --baseUrl=https://staging.example.com
deploy-production:
needs: build
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
environment:
name: production
url: https://example.com
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Configure kubectl
uses: azure/k8s-set-context@v3
with:
method: kubeconfig
kubeconfig: ${{ secrets.KUBE_CONFIG_PROD }}
- name: Deploy to Kubernetes
run: |
kubectl set image deployment/myapp \
app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }} \
-n production
kubectl rollout status deployment/myapp -n production --timeout=10m
- name: Verify deployment
run: |
kubectl get pods -n production -l app=myapp
kubectl get svc -n production myapp
- name: Run smoke tests
run: |
npm run test:e2e -- --baseUrl=https://example.com
- name: Notify Slack
if: always()
uses: slackapi/slack-github-action@v1
with:
webhook-url: ${{ secrets.SLACK_WEBHOOK }}
payload: |
{
"text": "Production deployment ${{ job.status }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "Deployment to production *${{ job.status }}*\nCommit: ${{ github.sha }}\nActor: ${{ github.actor }}"
}
}
]
}
```
## Best Practices
### 1. Infrastructure as Code
- Version control all infrastructure code
- Use modules for reusability
- Implement remote state with locking
- Tag all resources consistently
- Use workspaces for environments
### 2. Container Security
- Use minimal base images
- Run as non-root user
- Scan images for vulnerabilities
- Sign and verify images
- Keep images up to date
### 3. Kubernetes Security
- Use RBAC for access control
- Enable Pod Security Standards
- Implement network policies
- Use secrets management (e.g., Sealed Secrets, External Secrets)
- Regular security audits
### 4. CI/CD
- Automate testing at all levels
- Implement continuous deployment
- Use deployment strategies (blue-green, canary)
- Monitor deployments
- Quick rollback capability
### 5. Monitoring and Observability
- Centralized logging
- Metrics and alerting
- Distributed tracing
- Health checks and probes
- Regular audits and reviews