# DevOps & Infrastructure as Code Patterns

Comprehensive DevOps practices and Infrastructure as Code patterns using Terraform, Docker, Kubernetes, and CI/CD tools.

## Infrastructure as Code with Terraform

### Project Structure

```
terraform/
├── environments/
│   ├── dev/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   └── terraform.tfvars
│   ├── staging/
│   └── production/
├── modules/
│   ├── vpc/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   └── outputs.tf
│   ├── eks/
│   └── rds/
└── global/
    ├── s3/
    └── iam/
```

### Terraform Best Practices

#### Module Design

```hcl
# modules/vpc/main.tf
terraform {
  required_version = ">= 1.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }
}

resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-vpc"
    }
  )
}

resource "aws_subnet" "public" {
  count = length(var.public_subnet_cidrs)

  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.public_subnet_cidrs[count.index]
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-public-${count.index + 1}"
      Type = "public"
    }
  )
}

resource "aws_subnet" "private" {
  count = length(var.private_subnet_cidrs)

  vpc_id            = aws_vpc.main.id
  cidr_block        = var.private_subnet_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-private-${count.index + 1}"
      Type = "private"
    }
  )
}

resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-igw"
    }
  )
}

# One NAT gateway (and one Elastic IP) per public subnet when enabled.
resource "aws_nat_gateway" "main" {
  count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0

  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-nat-${count.index + 1}"
    }
  )

  depends_on = [aws_internet_gateway.main]
}

resource "aws_eip" "nat" {
  count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0

  domain = "vpc"

  tags = merge(
    var.common_tags,
    {
      Name = "${var.environment}-nat-eip-${count.index + 1}"
    }
  )
}

# modules/vpc/variables.tf
variable "environment" {
  description = "Environment name"
  type        = string
}

variable "vpc_cidr" {
  description = "CIDR block for VPC"
  type        = string
}

variable "public_subnet_cidrs" {
  description = "CIDR blocks for public subnets"
  type        = list(string)
}

variable "private_subnet_cidrs" {
  description = "CIDR blocks for private subnets"
  type        = list(string)
}

variable "availability_zones" {
  description = "Availability zones"
  type        = list(string)
}

variable "enable_nat_gateway" {
  description = "Enable NAT gateway"
  type        = bool
  default     = true
}

variable "common_tags" {
  description = "Common tags for all resources"
  type        = map(string)
  default     = {}
}

# modules/vpc/outputs.tf
output "vpc_id" {
  description = "VPC ID"
  value       = aws_vpc.main.id
}

output "public_subnet_ids" {
  description = "Public subnet IDs"
  value       = aws_subnet.public[*].id
}

output "private_subnet_ids" {
  description = "Private subnet IDs"
  value       = aws_subnet.private[*].id
}

output "nat_gateway_ips" {
  description = "NAT Gateway public IPs"
  value       = aws_eip.nat[*].public_ip
}
```

#### Environment Configuration

```hcl
# environments/production/main.tf
terraform {
  backend "s3" {
    bucket         = "myapp-terraform-state"
    key            = "production/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      Environment = "production"
      ManagedBy   = "Terraform"
      Project     = "MyApp"
    }
  }
}

module "vpc" {
  source = "../../modules/vpc"

  environment          = "production"
  vpc_cidr             = "10.0.0.0/16"
  public_subnet_cidrs  = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  private_subnet_cidrs = ["10.0.11.0/24", "10.0.12.0/24", "10.0.13.0/24"]
  availability_zones   = ["us-east-1a", "us-east-1b", "us-east-1c"]
  enable_nat_gateway   = true

  common_tags = {
    CostCenter = "Engineering"
  }
}

module "eks" {
  source = "../../modules/eks"

  cluster_name    = "production-eks"
  cluster_version = "1.28"
  vpc_id          = module.vpc.vpc_id
  subnet_ids      = module.vpc.private_subnet_ids

  node_groups = {
    general = {
      desired_size   = 3
      min_size       = 3
      max_size       = 10
      instance_types = ["t3.large"]
    }
    compute = {
      desired_size   = 2
      min_size       = 2
      max_size       = 20
      instance_types = ["c5.2xlarge"]
    }
  }
}

# environments/production/terraform.tfvars
aws_region = "us-east-1"
```

#### State Management

```hcl
# Setup S3 backend with state locking
resource "aws_s3_bucket" "terraform_state" {
  bucket = "myapp-terraform-state"

  lifecycle {
    prevent_destroy = true
  }

  tags = {
    Name = "Terraform State"
  }
}

resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

resource "aws_dynamodb_table" "terraform_locks" {
  name         = "terraform-state-lock"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }

  tags = {
    Name = "Terraform State Lock"
  }
}
```

## Docker Patterns

### Multi-Stage Builds

#### Node.js Application

```dockerfile
# Base stage for dependencies
FROM node:20-alpine AS base
WORKDIR /app
COPY package*.json ./

# Development stage
FROM base AS development
RUN npm install
COPY . .
CMD ["npm", "run", "dev"]

# Build stage
FROM base AS build
# Install ALL dependencies here: the build step needs devDependencies
# (compilers, bundlers), which a production-only install would omit.
RUN npm ci
COPY . .
# Build, then strip devDependencies so that only runtime packages are
# copied into the production image below.
RUN npm run build && npm prune --omit=dev

# Production stage
FROM node:20-alpine AS production
WORKDIR /app

# Create non-root user
RUN addgroup -g 1001 -S nodejs && \
    adduser -S nodejs -u 1001

# Copy only necessary files
COPY --from=build --chown=nodejs:nodejs /app/node_modules ./node_modules
COPY --from=build --chown=nodejs:nodejs /app/dist ./dist
COPY --chown=nodejs:nodejs package*.json ./

# Switch to non-root user
USER nodejs

EXPOSE 3000

CMD ["node", "dist/index.js"]
```

#### Go Application

```dockerfile
# Build stage
FROM golang:1.21-alpine AS builder

WORKDIR /build

# Cache dependencies
COPY go.mod go.sum ./
RUN go mod download

# Copy source code
COPY . .

# Build with optimizations
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
    -ldflags='-w -s -extldflags "-static"' \
    -a -installsuffix cgo \
    -o app ./cmd/server

# Production stage
FROM scratch

# Copy CA certificates for HTTPS
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/

# Copy binary
COPY --from=builder /build/app /app

# Non-root user
USER 1000:1000

EXPOSE 8080

ENTRYPOINT ["/app"]
```

#### Python Application

```dockerfile
# Build stage
FROM python:3.11-slim AS builder

WORKDIR /app

# Install dependencies
COPY requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt

# Production stage
FROM python:3.11-slim

WORKDIR /app

# Create non-root user and give it ownership of the working directory
RUN useradd -m -u 1000 appuser && \
    chown appuser:appuser /app

# Copy dependencies from builder into the non-root user's home.
# /root is not readable by other users, so packages left under /root/.local
# would be unreachable once the container drops to appuser.
COPY --from=builder --chown=appuser:appuser /root/.local /home/appuser/.local

# Copy application code
COPY --chown=appuser:appuser . .

USER appuser

# Update PATH so console scripts installed with `pip install --user` resolve
ENV PATH=/home/appuser/.local/bin:$PATH

EXPOSE 8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```

### Docker Compose for Development

```yaml
# docker-compose.yml
version: '3.8'

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
      target: development
    ports:
      - "3000:3000"
    volumes:
      - .:/app
      - /app/node_modules
    environment:
      - NODE_ENV=development
      - DATABASE_URL=postgresql://postgres:password@db:5432/myapp
      - REDIS_URL=redis://redis:6379
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_started
    networks:
      - app-network

  db:
    image: postgres:16-alpine
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=password
      - POSTGRES_DB=myapp
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5
    networks:
      - app-network

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    networks:
      - app-network

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - app
    networks:
      - app-network

volumes:
  postgres_data:
  redis_data:

networks:
  app-network:
    driver: bridge
```

## Kubernetes Patterns

### Deployment with Best Practices

```yaml
# deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: myapp
  namespace: production
  labels:
    app: myapp
    version: v1.0.0
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: myapp
  template:
    metadata:
      labels:
        app: myapp
        version: v1.0.0
    spec:
      serviceAccountName: myapp-sa

      # Security context
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000

      # Init containers
      initContainers:
        - name: migrate
          image: myapp:v1.0.0
          command: ['npm', 'run', 'migrate']
          envFrom:
            - secretRef:
                name: myapp-secrets

      containers:
        - name: app
          image: myapp:v1.0.0
          imagePullPolicy: IfNotPresent

          ports:
            - name: http
              containerPort: 3000
              protocol: TCP

          # Environment variables
          env:
            - name: NODE_ENV
              value: "production"
            - name: PORT
              value: "3000"

          envFrom:
            - configMapRef:
                name: myapp-config
            - secretRef:
                name: myapp-secrets

          # Resource limits
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi

          # Liveness probe
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3

          # Readiness probe
          readinessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 2

          # Security context
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL

          # Volume mounts (writable scratch space; the root filesystem is read-only)
          volumeMounts:
            - name: tmp
              mountPath: /tmp
            - name: cache
              mountPath: /app/.cache

      volumes:
        - name: tmp
          emptyDir: {}
        - name: cache
          emptyDir: {}

      # Pod anti-affinity: prefer spreading replicas across nodes.
      # (The PodDisruptionBudget is a separate manifest, pdb.yaml, below.)
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: myapp
                topologyKey: kubernetes.io/hostname
---
# service.yaml
apiVersion: v1
kind: Service
metadata:
  name: myapp
  namespace: production
spec:
  type: ClusterIP
  selector:
    app: myapp
  ports:
    - name: http
      port: 80
      targetPort: http
      protocol: TCP
---
# hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: myapp
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: myapp
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 2
          periodSeconds: 30
      selectPolicy: Max
---
# pdb.yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: myapp
  namespace: production
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: myapp
---
# configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: myapp-config
  namespace: production
data:
  LOG_LEVEL: "info"
  API_TIMEOUT: "30000"
  MAX_CONNECTIONS: "100"
---
# secret.yaml
# Placeholder values for illustration only. Never commit real credentials;
# use a secrets manager (e.g., Sealed Secrets, External Secrets) instead.
apiVersion: v1
kind: Secret
metadata:
  name: myapp-secrets
  namespace: production
type: Opaque
stringData:
  DATABASE_URL: "postgresql://user:password@db:5432/myapp"
  REDIS_URL: "redis://redis:6379"
  JWT_SECRET: "your-jwt-secret-here"
```

### Ingress with TLS

```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: myapp
  namespace: production
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # ingress-nginx rate limiting: requests per second accepted from one client IP
    nginx.ingress.kubernetes.io/limit-rps: "100"
    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - myapp.example.com
      secretName: myapp-tls
  rules:
    - host: myapp.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: myapp
                port:
                  number: 80
```

## CI/CD with GitHub Actions

### Complete Pipeline

```yaml
# .github/workflows/ci-cd.yml
name: CI/CD Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest

    services:
      postgres:
        image: postgres:16
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: test_db
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

      redis:
        image: redis:7
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 6379:6379

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Run linter
        run: npm run lint

      - name: Run type check
        run: npm run type-check

      - name: Run unit tests
        run: npm run test:unit
        env:
          NODE_ENV: test

      - name: Run integration tests
        run: npm run test:integration
        env:
          NODE_ENV: test
          DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db
          REDIS_URL: redis://localhost:6379

      - name: Generate coverage report
        run: npm run test:coverage

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          files: ./coverage/lcov.info
          fail_ci_if_error: true

  build:
    needs: test
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          # The last rule tags the image with the full commit SHA and no prefix,
          # which is the immutable tag the deploy jobs reference as github.sha.
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=sha,format=long,prefix=

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          platforms: linux/amd64,linux/arm64

  deploy-staging:
    needs: build
    if: github.ref == 'refs/heads/develop'
    runs-on: ubuntu-latest
    environment:
      name: staging
      url: https://staging.example.com

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Configure kubectl
        uses: azure/k8s-set-context@v3
        with:
          method: kubeconfig
          kubeconfig: ${{ secrets.KUBE_CONFIG_STAGING }}

      # Deploy by commit SHA rather than the mutable `develop` tag: setting the
      # same tag string again leaves the pod template unchanged, so no rollout
      # would be triggered.
      - name: Deploy to Kubernetes
        run: |
          kubectl set image deployment/myapp \
            app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }} \
            -n staging
          kubectl rollout status deployment/myapp -n staging --timeout=5m

      - name: Run smoke tests
        run: |
          npm ci
          npm run test:e2e -- --baseUrl=https://staging.example.com

  deploy-production:
    needs: build
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment:
      name: production
      url: https://example.com

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Configure kubectl
        uses: azure/k8s-set-context@v3
        with:
          method: kubeconfig
          kubeconfig: ${{ secrets.KUBE_CONFIG_PROD }}

      - name: Deploy to Kubernetes
        run: |
          kubectl set image deployment/myapp \
            app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }} \
            -n production
          kubectl rollout status deployment/myapp -n production --timeout=10m

      - name: Verify deployment
        run: |
          kubectl get pods -n production -l app=myapp
          kubectl get svc -n production myapp

      - name: Run smoke tests
        run: |
          npm ci
          npm run test:e2e -- --baseUrl=https://example.com

      - name: Notify Slack
        if: always()
        uses: slackapi/slack-github-action@v1
        env:
          # v1 of this action reads the incoming-webhook URL from the environment
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
        with:
          payload: |
            {
              "text": "Production deployment ${{ job.status }}",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "Deployment to production *${{ job.status }}*\nCommit: ${{ github.sha }}\nActor: ${{ github.actor }}"
                  }
                }
              ]
            }
```

## Best Practices

### 1. Infrastructure as Code

- Version control all infrastructure code
- Use modules for reusability
- Implement remote state with locking
- Tag all resources consistently
- Isolate environments with separate directories and state (as in the project structure above) or with workspaces

### 2. Container Security

- Use minimal base images
- Run as non-root user
- Scan images for vulnerabilities
- Sign and verify images
- Keep images up to date

### 3. Kubernetes Security

- Use RBAC for access control
- Enable Pod Security Standards
- Implement network policies
- Use secrets management (e.g., Sealed Secrets, External Secrets)
- Regular security audits

### 4. CI/CD

- Automate testing at all levels
- Implement continuous deployment
- Use deployment strategies (blue-green, canary)
- Monitor deployments
- Quick rollback capability

### 5. Monitoring and Observability

- Centralized logging
- Metrics and alerting
- Distributed tracing
- Health checks and probes
- Regular audits and reviews