From a91d4d5a1c050c96dcbc64ddd995d0256e520700 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 17:51:15 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 11 + README.md | 3 + SKILL.md | 568 ++++++++++++++++++ assets/applicationsets/cluster-generator.yaml | 32 + assets/argocd/install-argocd-3.x.yaml | 92 +++ assets/flux/flux-bootstrap-github.sh | 49 ++ assets/flux/oci-helmrelease.yaml | 38 ++ .../argo-rollouts-canary.yaml | 62 ++ plugin.lock.json | 133 ++++ references/argocd_vs_flux.md | 243 ++++++++ references/best_practices.md | 160 +++++ references/multi_cluster.md | 80 +++ references/oci_artifacts.md | 290 +++++++++ references/progressive_delivery.md | 94 +++ references/repo_patterns.md | 184 ++++++ references/secret_management.md | 213 +++++++ references/troubleshooting.md | 134 +++++ scripts/applicationset_generator.py | 156 +++++ scripts/check_argocd_health.py | 275 +++++++++ scripts/check_flux_health.py | 418 +++++++++++++ scripts/oci_artifact_checker.py | 150 +++++ scripts/promotion_validator.py | 88 +++ scripts/secret_audit.py | 178 ++++++ scripts/sync_drift_detector.py | 144 +++++ scripts/validate_gitops_repo.py | 299 +++++++++ 25 files changed, 4094 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 assets/applicationsets/cluster-generator.yaml create mode 100644 assets/argocd/install-argocd-3.x.yaml create mode 100644 assets/flux/flux-bootstrap-github.sh create mode 100644 assets/flux/oci-helmrelease.yaml create mode 100644 assets/progressive-delivery/argo-rollouts-canary.yaml create mode 100644 plugin.lock.json create mode 100644 references/argocd_vs_flux.md create mode 100644 references/best_practices.md create mode 100644 references/multi_cluster.md create mode 100644 references/oci_artifacts.md create mode 100644 references/progressive_delivery.md create mode 100644 references/repo_patterns.md create mode 100644 references/secret_management.md create mode 100644 references/troubleshooting.md create mode 100644 scripts/applicationset_generator.py create mode 100644 scripts/check_argocd_health.py create mode 100644 scripts/check_flux_health.py create mode 100644 scripts/oci_artifact_checker.py create mode 100644 scripts/promotion_validator.py create mode 100644 scripts/secret_audit.py create mode 100644 scripts/sync_drift_detector.py create mode 100644 scripts/validate_gitops_repo.py diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..1daabfa --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "gitops-workflows", + "description": "GitOps workflows with ArgoCD and Flux CD including multi-cluster management, secrets, and progressive delivery", + "version": "1.0.0", + "author": { + "name": "DevOps Claude Skills" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e419a5 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# gitops-workflows + +GitOps workflows with ArgoCD and Flux CD including multi-cluster management, secrets, and progressive delivery diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..0a67585 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,568 @@ +--- +name: gitops-workflows +description: GitOps deployment workflows with ArgoCD and Flux. 
Use for setting up GitOps (ArgoCD 3.x, Flux 2.7), designing repository structures (monorepo/polyrepo, app-of-apps), multi-cluster deployments (ApplicationSets, hub-spoke), secrets management (SOPS+age, Sealed Secrets, External Secrets Operator), progressive delivery (Argo Rollouts, Flagger), troubleshooting sync issues, and OCI artifact management. Covers latest 2024-2025 features: ArgoCD annotation-based tracking, fine-grained RBAC, Flux OCI artifacts GA, image automation, source-watcher. +--- + +# GitOps Workflows + +## Overview + +This skill provides comprehensive GitOps workflows for continuous deployment to Kubernetes using ArgoCD 3.x and Flux 2.7+. + +**When to use this skill**: +- Setting up GitOps from scratch (ArgoCD or Flux) +- Designing Git repository structures +- Multi-cluster deployments +- Troubleshooting sync/reconciliation issues +- Implementing secrets management +- Progressive delivery (canary, blue-green) +- Migrating between GitOps tools + +--- + +## Core Workflow: GitOps Implementation + +Use this decision tree to determine your starting point: + +``` +Do you have GitOps installed? +├─ NO → Need to choose a tool +│ └─ Want UI + easy onboarding? → ArgoCD (Workflow 1) +│ └─ Want modularity + platform engineering? → Flux (Workflow 2) +└─ YES → What's your goal? + ├─ Sync issues / troubleshooting → Workflow 7 + ├─ Multi-cluster deployment → Workflow 4 + ├─ Secrets management → Workflow 5 + ├─ Progressive delivery → Workflow 6 + ├─ Repository structure → Workflow 3 + └─ Tool comparison → Read references/argocd_vs_flux.md +``` + +--- + +## 1. Initial Setup: ArgoCD 3.x + +**Latest Version**: v3.1.9 (stable), v3.2.0-rc4 (October 2025) + +### Quick Install + +```bash +# Create namespace +kubectl create namespace argocd + +# Install ArgoCD 3.x +kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/v3.1.9/manifests/install.yaml + +# Get admin password +kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d + +# Port forward to access UI +kubectl port-forward svc/argocd-server -n argocd 8080:443 +# Access: https://localhost:8080 +``` + +**→ Template**: [assets/argocd/install-argocd-3.x.yaml](assets/argocd/install-argocd-3.x.yaml) + +### ArgoCD 3.x New Features + +**Breaking Changes**: +- ✅ Annotation-based tracking (default, was labels) +- ✅ RBAC logs enforcement enabled +- ✅ Legacy metrics removed + +**New Features**: +- ✅ Fine-grained RBAC (per-resource permissions) +- ✅ Better defaults (resource exclusions for performance) +- ✅ Secrets operators endorsement + +### Deploy Your First Application + +```bash +# CLI method +argocd app create guestbook \ + --repo https://github.com/argoproj/argocd-example-apps.git \ + --path guestbook \ + --dest-server https://kubernetes.default.svc \ + --dest-namespace default + +# Sync application +argocd app sync guestbook +``` + +### Health Check + +```bash +# Check application health +python3 scripts/check_argocd_health.py \ + --server https://argocd.example.com \ + --token $ARGOCD_TOKEN +``` + +**→ Script**: [scripts/check_argocd_health.py](scripts/check_argocd_health.py) + +--- + +## 2. 
Initial Setup: Flux 2.7 + +**Latest Version**: v2.7.1 (October 2025) + +### Quick Install + +```bash +# Install Flux CLI +brew install fluxcd/tap/flux # macOS +# or: curl -s https://fluxcd.io/install.sh | sudo bash + +# Check prerequisites +flux check --pre + +# Bootstrap Flux (GitHub) +export GITHUB_TOKEN= +flux bootstrap github \ + --owner= \ + --repository=fleet-infra \ + --branch=main \ + --path=clusters/production \ + --personal + +# Enable source-watcher (Flux 2.7+) +flux install --components-extra=source-watcher +``` + +**→ Template**: [assets/flux/flux-bootstrap-github.sh](assets/flux/flux-bootstrap-github.sh) + +### Flux 2.7 New Features + +- ✅ Image automation GA +- ✅ ExternalArtifact and ArtifactGenerator APIs +- ✅ Source-watcher component for better performance +- ✅ OpenTelemetry tracing support +- ✅ CEL expressions for readiness evaluation + +### Deploy Your First Application + +```yaml +# gitrepository.yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: GitRepository +metadata: + name: podinfo + namespace: flux-system +spec: + interval: 1m + url: https://github.com/stefanprodan/podinfo + ref: + branch: master +--- +# kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: podinfo + namespace: flux-system +spec: + interval: 5m + path: "./kustomize" + prune: true + sourceRef: + kind: GitRepository + name: podinfo +``` + +### Health Check + +```bash +# Check Flux health +python3 scripts/check_flux_health.py --namespace flux-system +``` + +**→ Script**: [scripts/check_flux_health.py](scripts/check_flux_health.py) + +--- + +## 3. Repository Structure Design + +**Decision: Monorepo or Polyrepo?** + +### Monorepo Pattern + +**Best for**: Startups, small teams (< 20 apps), single team + +``` +gitops-repo/ +├── apps/ +│ ├── frontend/ +│ ├── backend/ +│ └── database/ +├── infrastructure/ +│ ├── ingress/ +│ ├── monitoring/ +│ └── secrets/ +└── clusters/ + ├── dev/ + ├── staging/ + └── production/ +``` + +### Polyrepo Pattern + +**Best for**: Large orgs, multiple teams, clear boundaries + +``` +infrastructure-repo/ (Platform team) +app-team-1-repo/ (Team 1) +app-team-2-repo/ (Team 2) +``` + +### Environment Structure (Kustomize) + +``` +app/ +├── base/ +│ ├── deployment.yaml +│ ├── service.yaml +│ └── kustomization.yaml +└── overlays/ + ├── dev/ + │ ├── kustomization.yaml + │ └── replica-patch.yaml + ├── staging/ + └── production/ +``` + +**→ Reference**: [references/repo_patterns.md](references/repo_patterns.md) + +### Validate Repository Structure + +```bash +python3 scripts/validate_gitops_repo.py /path/to/repo +``` + +**→ Script**: [scripts/validate_gitops_repo.py](scripts/validate_gitops_repo.py) + +--- + +## 4. 
Multi-Cluster Deployments + +### ArgoCD ApplicationSets + +**Cluster Generator** (deploy to all clusters): + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: cluster-apps +spec: + generators: + - cluster: + selector: + matchLabels: + environment: production + template: + metadata: + name: '{{name}}-myapp' + spec: + source: + repoURL: https://github.com/org/apps + path: myapp + destination: + server: '{{server}}' +``` + +**→ Template**: [assets/applicationsets/cluster-generator.yaml](assets/applicationsets/cluster-generator.yaml) + +**Performance Benefit**: 83% faster deployments (30min → 5min) + +### Generate ApplicationSets + +```bash +# Cluster generator +python3 scripts/applicationset_generator.py cluster \ + --name my-apps \ + --repo-url https://github.com/org/repo \ + --output appset.yaml + +# Matrix generator (cluster x apps) +python3 scripts/applicationset_generator.py matrix \ + --name my-apps \ + --cluster-label production \ + --directories app1,app2,app3 \ + --output appset.yaml +``` + +**→ Script**: [scripts/applicationset_generator.py](scripts/applicationset_generator.py) + +### Flux Multi-Cluster + +**Hub-and-Spoke**: Management cluster manages all clusters + +```bash +# Bootstrap each cluster +flux bootstrap github --context prod-cluster --path clusters/production +flux bootstrap github --context staging-cluster --path clusters/staging +``` + +**→ Reference**: [references/multi_cluster.md](references/multi_cluster.md) + +--- + +## 5. Secrets Management + +**Never commit plain secrets to Git.** Choose a solution: + +### Decision Matrix + +| Solution | Complexity | Best For | 2025 Trend | +|----------|-----------|----------|------------| +| **SOPS + age** | Medium | Git-centric, flexible | ↗️ Preferred | +| **External Secrets Operator** | Medium | Cloud-native, dynamic | ↗️ Growing | +| **Sealed Secrets** | Low | Simple, GitOps-first | → Stable | + +### Option 1: SOPS + age (Recommended 2025) + +**Setup**: +```bash +# Generate age key +age-keygen -o key.txt +# Public key: age1... + +# Create .sops.yaml +cat < .sops.yaml +creation_rules: + - path_regex: .*.yaml + encrypted_regex: ^(data|stringData)$ + age: age1ql3z7hjy54pw3hyww5ayyfg7zqgvc7w3j2elw8zmrj2kg5sfn9aqmcac8p +EOF + +# Encrypt secret +kubectl create secret generic my-secret --dry-run=client -o yaml \ + --from-literal=password=supersecret > secret.yaml +sops -e secret.yaml > secret.enc.yaml + +# Commit encrypted version +git add secret.enc.yaml .sops.yaml +``` + +**→ Template**: [assets/secrets/sops-age-config.yaml](assets/secrets/sops-age-config.yaml) + +### Option 2: External Secrets Operator (v0.20+) + +**Best for**: Cloud-native apps, dynamic secrets, automatic rotation + +### Option 3: Sealed Secrets + +**Best for**: Simple setup, static secrets, no external dependencies + +**→ Reference**: [references/secret_management.md](references/secret_management.md) + +### Audit Secrets + +```bash +python3 scripts/secret_audit.py /path/to/repo +``` + +**→ Script**: [scripts/secret_audit.py](scripts/secret_audit.py) + +--- + +## 6. 
Progressive Delivery + +### Argo Rollouts (with ArgoCD) + +**Canary Deployment**: +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: my-app +spec: + strategy: + canary: + steps: + - setWeight: 20 + - pause: {duration: 2m} + - setWeight: 50 + - pause: {duration: 2m} + - setWeight: 100 +``` + +**→ Template**: [assets/progressive-delivery/argo-rollouts-canary.yaml](assets/progressive-delivery/argo-rollouts-canary.yaml) + +### Flagger (with Flux) + +**Canary with Metrics Analysis**: +```yaml +apiVersion: flagger.app/v1beta1 +kind: Canary +metadata: + name: my-app +spec: + analysis: + interval: 1m + threshold: 5 + maxWeight: 50 + stepWeight: 10 + metrics: + - name: request-success-rate + thresholdRange: + min: 99 +``` + +**→ Reference**: [references/progressive_delivery.md](references/progressive_delivery.md) + +--- + +## 7. Troubleshooting + +### Common Issues + +**ArgoCD OutOfSync**: +```bash +# Check differences +argocd app diff my-app + +# Sync application +argocd app sync my-app + +# Check health +python3 scripts/check_argocd_health.py --server https://argocd.example.com --token $TOKEN +``` + +**Flux Not Reconciling**: +```bash +# Check resources +flux get all + +# Check specific kustomization +flux get kustomizations +kubectl describe kustomization my-app -n flux-system + +# Force reconcile +flux reconcile kustomization my-app +``` + +**Detect Drift**: +```bash +# ArgoCD drift detection +python3 scripts/sync_drift_detector.py --argocd --app my-app + +# Flux drift detection +python3 scripts/sync_drift_detector.py --flux +``` + +**→ Script**: [scripts/sync_drift_detector.py](scripts/sync_drift_detector.py) + +**→ Reference**: [references/troubleshooting.md](references/troubleshooting.md) + +--- + +## 8. OCI Artifacts (Flux 2.6+) + +**GA Status**: Flux v2.6 (June 2025) + +### Use OCIRepository for Helm Charts + +```yaml +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: podinfo-oci +spec: + interval: 5m + url: oci://ghcr.io/stefanprodan/charts/podinfo + ref: + semver: ">=6.0.0" + verify: + provider: cosign +``` + +**→ Template**: [assets/flux/oci-helmrelease.yaml](assets/flux/oci-helmrelease.yaml) + +### Verify OCI Artifacts + +```bash +python3 scripts/oci_artifact_checker.py \ + --verify ghcr.io/org/app:v1.0.0 \ + --provider cosign +``` + +**→ Script**: [scripts/oci_artifact_checker.py](scripts/oci_artifact_checker.py) + +**→ Reference**: [references/oci_artifacts.md](references/oci_artifacts.md) + +--- + +## Quick Reference Commands + +### ArgoCD + +```bash +# List applications +argocd app list + +# Get application details +argocd app get + +# Sync application +argocd app sync + +# View diff +argocd app diff + +# Delete application +argocd app delete +``` + +### Flux + +```bash +# Check Flux status +flux check + +# Get all resources +flux get all + +# Reconcile immediately +flux reconcile source git +flux reconcile kustomization + +# Suspend/Resume +flux suspend kustomization +flux resume kustomization + +# Export resources +flux export source git --all > sources.yaml +``` + +--- + +## Resources Summary + +### Scripts (automation and diagnostics) +- `check_argocd_health.py` - Diagnose ArgoCD sync issues (3.x compatible) +- `check_flux_health.py` - Diagnose Flux reconciliation issues (2.7+ compatible) +- `validate_gitops_repo.py` - Validate repository structure and manifests +- `sync_drift_detector.py` - Detect drift between Git and cluster +- `secret_audit.py` - Audit secrets management (SOPS, Sealed Secrets, ESO) +- 
`applicationset_generator.py` - Generate ApplicationSet manifests +- `promotion_validator.py` - Validate environment promotion workflows +- `oci_artifact_checker.py` - Validate Flux OCI artifacts and verify signatures + +### References (deep-dive documentation) +- `argocd_vs_flux.md` - Comprehensive comparison (2024-2025), decision matrix +- `repo_patterns.md` - Monorepo vs polyrepo, app-of-apps, environment structures +- `secret_management.md` - SOPS+age, Sealed Secrets, ESO (2025 best practices) +- `progressive_delivery.md` - Argo Rollouts, Flagger, canary/blue-green patterns +- `multi_cluster.md` - ApplicationSets, Flux multi-tenancy, hub-spoke patterns +- `troubleshooting.md` - Common sync issues, debugging commands +- `best_practices.md` - CNCF GitOps principles, security, 2025 recommendations +- `oci_artifacts.md` - Flux OCI artifacts (GA v2.6), signature verification + +### Templates (production-ready configurations) +- `argocd/install-argocd-3.x.yaml` - ArgoCD 3.x installation with best practices +- `applicationsets/cluster-generator.yaml` - Multi-cluster ApplicationSet example +- `flux/flux-bootstrap-github.sh` - Flux 2.7 bootstrap script +- `flux/oci-helmrelease.yaml` - OCI artifact + HelmRelease example +- `secrets/sops-age-config.yaml` - SOPS + age configuration +- `progressive-delivery/argo-rollouts-canary.yaml` - Canary deployment with analysis diff --git a/assets/applicationsets/cluster-generator.yaml b/assets/applicationsets/cluster-generator.yaml new file mode 100644 index 0000000..38b3a96 --- /dev/null +++ b/assets/applicationsets/cluster-generator.yaml @@ -0,0 +1,32 @@ +# ApplicationSet with Cluster Generator +# Automatically deploys to all clusters matching label selector +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: cluster-apps + namespace: argocd +spec: + goTemplate: true + generators: + - cluster: + selector: + matchLabels: + environment: production + template: + metadata: + name: '{{.name}}-guestbook' + spec: + project: default + source: + repoURL: https://github.com/argoproj/argocd-example-apps + targetRevision: HEAD + path: guestbook + destination: + server: '{{.server}}' + namespace: guestbook + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/assets/argocd/install-argocd-3.x.yaml b/assets/argocd/install-argocd-3.x.yaml new file mode 100644 index 0000000..6152b8d --- /dev/null +++ b/assets/argocd/install-argocd-3.x.yaml @@ -0,0 +1,92 @@ +# ArgoCD 3.x Installation with best practices +# Updated for ArgoCD v3.1+ +apiVersion: v1 +kind: Namespace +metadata: + name: argocd +--- +# Install ArgoCD using official manifests +# kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/v3.1.9/manifests/install.yaml + +# Configuration with ArgoCD 3.x best practices +apiVersion: v1 +kind: ConfigMap +metadata: + name: argocd-cmd-params-cm + namespace: argocd +data: + # Enable fine-grained RBAC (ArgoCD 3.0+) + server.enable.gzip: "true" + # Resource exclusions (default in 3.x) + resource.exclusions: | + - apiGroups: + - "" + kinds: + - Endpoints + - EndpointSlice + clusters: + - "*" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: argocd-cm + namespace: argocd +data: + # Annotation-based tracking (default in ArgoCD 3.x) + application.resourceTrackingMethod: annotation + + # Resource exclusions for performance + resource.exclusions: | + - apiGroups: + - "*" + kinds: + - Lease + clusters: + - "*" +--- +# Expose ArgoCD Server (choose one method) + +# Option 1: 
LoadBalancer +apiVersion: v1 +kind: Service +metadata: + name: argocd-server-lb + namespace: argocd +spec: + type: LoadBalancer + ports: + - port: 80 + targetPort: 8080 + protocol: TCP + selector: + app.kubernetes.io/name: argocd-server + +# Option 2: Ingress (recommended) +# --- +# apiVersion: networking.k8s.io/v1 +# kind: Ingress +# metadata: +# name: argocd-server-ingress +# namespace: argocd +# annotations: +# cert-manager.io/cluster-issuer: letsencrypt-prod +# nginx.ingress.kubernetes.io/ssl-passthrough: "true" +# nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" +# spec: +# ingressClassName: nginx +# rules: +# - host: argocd.example.com +# http: +# paths: +# - path: / +# pathType: Prefix +# backend: +# service: +# name: argocd-server +# port: +# number: 443 +# tls: +# - hosts: +# - argocd.example.com +# secretName: argocd-server-tls diff --git a/assets/flux/flux-bootstrap-github.sh b/assets/flux/flux-bootstrap-github.sh new file mode 100644 index 0000000..e0e0f63 --- /dev/null +++ b/assets/flux/flux-bootstrap-github.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Flux 2.7+ Bootstrap Script for GitHub + +set -e + +# Configuration +GITHUB_USER="${GITHUB_USER:-your-org}" +GITHUB_REPO="${GITHUB_REPO:-fleet-infra}" +GITHUB_TOKEN="${GITHUB_TOKEN:-}" +CLUSTER_NAME="${CLUSTER_NAME:-production}" +CLUSTER_PATH="clusters/${CLUSTER_NAME}" + +# Check prerequisites +command -v flux >/dev/null 2>&1 || { echo "flux CLI required"; exit 1; } +command -v kubectl >/dev/null 2>&1 || { echo "kubectl required"; exit 1; } + +# Check GitHub token +if [ -z "$GITHUB_TOKEN" ]; then + echo "Error: GITHUB_TOKEN environment variable not set" + exit 1 +fi + +# Bootstrap Flux +echo "🚀 Bootstrapping Flux for cluster: $CLUSTER_NAME" + +flux bootstrap github \ + --owner="$GITHUB_USER" \ + --repository="$GITHUB_REPO" \ + --branch=main \ + --path="$CLUSTER_PATH" \ + --personal \ + --token-auth + +# Enable source-watcher (Flux 2.7+) +echo "✨ Enabling source-watcher component..." +flux install --components-extra=source-watcher + +# Verify installation +echo "✅ Verifying Flux installation..." +flux check + +echo " +✅ Flux bootstrapped successfully! + +Next steps: +1. Add your applications to ${CLUSTER_PATH}/apps/ +2. Commit and push to trigger Flux reconciliation +3. 
Monitor with: flux get all +" diff --git a/assets/flux/oci-helmrelease.yaml b/assets/flux/oci-helmrelease.yaml new file mode 100644 index 0000000..afbe2a6 --- /dev/null +++ b/assets/flux/oci-helmrelease.yaml @@ -0,0 +1,38 @@ +# Flux OCI Repository + HelmRelease (Flux 2.6+) +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: podinfo-oci + namespace: flux-system +spec: + interval: 5m + url: oci://ghcr.io/stefanprodan/charts/podinfo + ref: + semver: ">=6.0.0" + verify: + provider: cosign + secretRef: + name: cosign-public-key +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: podinfo + namespace: default +spec: + interval: 10m + chart: + spec: + chart: podinfo + sourceRef: + kind: OCIRepository + name: podinfo-oci + namespace: flux-system + values: + replicaCount: 2 + resources: + limits: + memory: 256Mi + requests: + cpu: 100m + memory: 64Mi diff --git a/assets/progressive-delivery/argo-rollouts-canary.yaml b/assets/progressive-delivery/argo-rollouts-canary.yaml new file mode 100644 index 0000000..9aaec20 --- /dev/null +++ b/assets/progressive-delivery/argo-rollouts-canary.yaml @@ -0,0 +1,62 @@ +# Argo Rollouts Canary Deployment with Analysis +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: my-app +spec: + replicas: 5 + strategy: + canary: + steps: + - setWeight: 20 + - pause: {duration: 2m} + - setWeight: 40 + - pause: {duration: 2m} + - setWeight: 60 + - pause: {duration: 2m} + - setWeight: 80 + - pause: {duration: 2m} + analysis: + templates: + - templateName: success-rate + startingStep: 2 + args: + - name: service-name + value: my-app + selector: + matchLabels: + app: my-app + template: + metadata: + labels: + app: my-app + spec: + containers: + - name: my-app + image: myapp:v2.0.0 + ports: + - containerPort: 8080 +--- +# Analysis Template using Prometheus +apiVersion: argoproj.io/v1alpha1 +kind: AnalysisTemplate +metadata: + name: success-rate +spec: + args: + - name: service-name + metrics: + - name: success-rate + interval: 1m + successCondition: result[0] >= 0.95 + failureLimit: 3 + provider: + prometheus: + address: http://prometheus.monitoring:9090 + query: | + sum(rate( + http_requests_total{job="{{args.service-name}}",status!~"5.."}[2m] + )) / + sum(rate( + http_requests_total{job="{{args.service-name}}"}[2m] + )) diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..690fd91 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,133 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:ahmedasmar/devops-claude-skills:gitops-workflows", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "9d9aba99c48eab607e17775890549925e3cf492c", + "treeHash": "d0ab5ad5352a26f2e20ecbe92fe6a75ea200b094e9bdd53fbdd7314b921ea051", + "generatedAt": "2025-11-28T10:13:03.655231Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "gitops-workflows", + "description": "GitOps workflows with ArgoCD and Flux CD including multi-cluster management, secrets, and progressive delivery", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "867bba9b73becced4d98602771881b393eed69f879e82e3db15e03caa1495553" + }, + { + "path": "SKILL.md", + "sha256": 
"ca26dd3567959c1ef4fe83111e21b53cdd7681355a62e38122acb0c517322ccb" + }, + { + "path": "references/progressive_delivery.md", + "sha256": "c9873ae80def5528aa91cf977047f2d4e743f22a03ab1f07855aaf6c620a7ae2" + }, + { + "path": "references/troubleshooting.md", + "sha256": "00e5fc7d8752b25b8a51a358d4d7510e20ee9b698dd511545980142dfd5c6510" + }, + { + "path": "references/argocd_vs_flux.md", + "sha256": "86f5977b45f7f2a38292d937153c91c1806f70a05ee5f2593ffcebbe6530fbc6" + }, + { + "path": "references/multi_cluster.md", + "sha256": "018343bfb8c3b6f061f0a95aea8fced73c3e0b57ecc1892b7481525f2e3f3c2c" + }, + { + "path": "references/oci_artifacts.md", + "sha256": "b2545bd256a61012b87407ecbcf7c3452c2e28e23b6c2db664f8d8b33a33a5c1" + }, + { + "path": "references/repo_patterns.md", + "sha256": "b9fdf169b26f7f225d2ca89422f4ae6475f2413b31a12237d9a551a8de00eeee" + }, + { + "path": "references/secret_management.md", + "sha256": "6c4dd5098220438397fc05032bcc506982b196be76a38d7c931adef317009a00" + }, + { + "path": "references/best_practices.md", + "sha256": "136045d07ac582349ac6d211823855255c9a8364ba8fcd892579dc6cdfbf25e0" + }, + { + "path": "scripts/applicationset_generator.py", + "sha256": "0f179e4c990c95decc0e721558cb6283f975abf353999ef2d6c68458262c6a4c" + }, + { + "path": "scripts/promotion_validator.py", + "sha256": "834e8ffab247627717bbf289b63f35b9c49776dbe0478bd5eb2537c0be7a9475" + }, + { + "path": "scripts/sync_drift_detector.py", + "sha256": "d7e0abad75ec1eb406edd643918b0d7a99cf0e457b9349ed517a79560a08d6ab" + }, + { + "path": "scripts/secret_audit.py", + "sha256": "b0fd6209a363724c8982319363e51a3a7e3256d6120acd1c56ef23e697d5b539" + }, + { + "path": "scripts/check_argocd_health.py", + "sha256": "6ed7bffeedf5f862d945dc2c50facd2a07e49170bb5314fafe9d39fdcc84f2f2" + }, + { + "path": "scripts/oci_artifact_checker.py", + "sha256": "109d02231138a5ca09f4304a862b9268742628b902b6ca16e826ebeae958b949" + }, + { + "path": "scripts/validate_gitops_repo.py", + "sha256": "bb81659411d59bdc0fe028e1089ce69ca20fbb9f695f40a8f910bdebdc71d39a" + }, + { + "path": "scripts/check_flux_health.py", + "sha256": "a9d3acc40aee91c12049486f7807c6e0b6a0a78cc1ca68187184d440447fe2fa" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "d4e4e6ab1b5616e5c05550abac625b7af113cf32e12581709be24c4481af4ccb" + }, + { + "path": "assets/progressive-delivery/argo-rollouts-canary.yaml", + "sha256": "c94189fb722e9da934c56dc7b3168737cb9b2aa3e8750a4bed236337ad339e4e" + }, + { + "path": "assets/flux/flux-bootstrap-github.sh", + "sha256": "199535c78799bc13865d079c9d179351fe2991d9487e30d6e3a6ff692e58606f" + }, + { + "path": "assets/flux/oci-helmrelease.yaml", + "sha256": "7959cfed54faffd8346927a01461dcc1296a61bc4f6c543ba46089cc8161cc34" + }, + { + "path": "assets/argocd/install-argocd-3.x.yaml", + "sha256": "ab1f6555a685d0070858378071de0749d1bcc3a821fbecf6f4f353a05862f27c" + }, + { + "path": "assets/secrets/sops-age-config.yaml", + "sha256": "e20729d61388ba4a3746e105801e0944d94ed7d5dd5e58f7d5bb561831c9ed08" + }, + { + "path": "assets/applicationsets/cluster-generator.yaml", + "sha256": "767f16f17c1b60dc802b9e6e140737c6ca5cf56a8769012f1d5605b3cb43041a" + } + ], + "dirSha256": "d0ab5ad5352a26f2e20ecbe92fe6a75ea200b094e9bdd53fbdd7314b921ea051" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/argocd_vs_flux.md b/references/argocd_vs_flux.md new file mode 100644 index 0000000..8872f7c --- /dev/null +++ b/references/argocd_vs_flux.md @@ -0,0 +1,243 @@ +# ArgoCD vs 
Flux: Comprehensive Comparison (2024-2025) + +## Current Versions (October 2025) + +- **ArgoCD**: v3.1.9 (stable), v3.2.0-rc4 (release candidate) +- **Flux**: v2.7.1 (latest) + +## Quick Decision Matrix + +| Criteria | Choose ArgoCD | Choose Flux | +|----------|---------------|-------------| +| **Primary Focus** | Developer experience, UI | Platform engineering, modularity | +| **Team Size** | Medium-large teams | Small teams, platform engineers | +| **UI Required** | Yes | No (CLI-driven) | +| **Complexity** | Simpler onboarding | Steeper learning curve | +| **Customization** | Less modular | Highly modular | +| **Multi-tenancy** | Built-in with Projects | Manual configuration | +| **Best For** | Application teams, demos | Infrastructure teams, advanced users | + +## Key Differences + +### Architecture + +**ArgoCD**: +- Monolithic design with integrated components +- Web UI, API server, application controller in one system +- Centralized control plane + +**Flux**: +- Modular microservices architecture +- Separate controllers: source, kustomize, helm, notification, image-automation +- Distributed reconciliation + +### User Experience + +**ArgoCD**: +- Rich web UI for visualization and management +- GUI dashboard for deployment, syncing, troubleshooting +- Easier onboarding for developers +- Better for demos and presentations + +**Flux**: +- CLI-driven (flux CLI + kubectl) +- No built-in UI (can integrate with Weave GitOps UI separately) +- Requires comfort with command-line tools +- Steeper learning curve + +### Application Management + +**ArgoCD 3.x**: +- Application and ApplicationSet CRDs +- App-of-apps pattern for organizing applications +- Fine-grained RBAC (new in v3.0) +- Annotation-based tracking (default in v3.0, changed from labels) + +**Flux 2.7**: +- Kustomization and HelmRelease CRDs +- No built-in grouping mechanism +- RBAC through Kubernetes RBAC +- Label-based tracking + +### Multi-Cluster Support + +**ArgoCD ApplicationSets**: +- Cluster generator for auto-discovery +- Matrix generator for cluster x app combinations +- Hub-and-spoke pattern (one ArgoCD manages multiple clusters) +- 83% faster deployments vs manual (30min → 5min) + +**Flux Multi-Tenancy**: +- Manual cluster configuration +- Separate Flux installations per cluster or shared +- More flexible but requires more setup +- No built-in cluster generator + +### Secrets Management + +Both support: +- Sealed Secrets +- External Secrets Operator +- SOPS + +**ArgoCD 3.0 Change**: +- Now explicitly endorses secrets operators +- Cautions against config management plugins for secrets +- Better integration with ESO + +**Flux**: +- Native SOPS integration with age encryption +- Decryption happens in-cluster +- .sops.yaml configuration support + +### Progressive Delivery + +**ArgoCD + Argo Rollouts**: +- Separate project but tight integration +- Rich UI for visualizing rollouts +- Supports canary, blue-green, A/B testing +- Metric analysis with Prometheus, Datadog, etc. + +**Flux + Flagger**: +- Flagger as companion project +- CLI-driven +- Supports canary, blue-green, A/B testing +- Metric analysis with Prometheus, Datadog, etc. 
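+
+Both tools surface rollout state as Kubernetes CRDs, so the day-to-day difference is mostly tooling. A minimal sketch of watching a canary with each (resource and namespace names are placeholders; assumes the `kubectl argo rollouts` plugin is installed and Flagger's Canary CRD is present):
+
+```bash
+# ArgoCD + Argo Rollouts: watch a rollout progress via the kubectl plugin
+kubectl argo rollouts get rollout my-app --watch
+
+# Flux + Flagger: no dedicated CLI; observe the Canary resource directly
+kubectl get canary my-app -n my-namespace
+kubectl describe canary my-app -n my-namespace
+```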
+ +## Feature Comparison + +| Feature | ArgoCD 3.x | Flux 2.7 | +|---------|-----------|----------| +| **Web UI** | ✅ Built-in | ❌ (3rd party available) | +| **CLI** | ✅ argocd | ✅ flux | +| **Git Sources** | ✅ | ✅ | +| **OCI Artifacts** | ❌ | ✅ (GA in v2.6) | +| **Helm Support** | ✅ | ✅ | +| **Kustomize** | ✅ (v5.7.0) | ✅ (v5.7.0) | +| **Multi-tenancy** | ✅ Projects | Manual | +| **Image Automation** | ⚠️ Via Image Updater | ✅ GA in v2.7 | +| **Notifications** | ✅ | ✅ | +| **RBAC** | ✅ Fine-grained (v3.0) | Kubernetes RBAC | +| **Progressive Delivery** | Argo Rollouts | Flagger | +| **Signature Verification** | ⚠️ Limited | ✅ cosign/notation | + +## Performance & Scale + +**ArgoCD**: +- Can manage 1000+ applications per instance +- Better defaults in v3.0 (resource exclusions reduce API load) +- ApplicationSets reduce management overhead + +**Flux**: +- Lighter resource footprint +- Better for large-scale monorepos +- Source-watcher (v2.7) improves reconciliation efficiency + +## Community & Support + +**ArgoCD**: +- CNCF Graduated project +- Large community, many contributors +- Akuity offers commercial support +- Annual ArgoCon conference + +**Flux**: +- CNCF Graduated project +- Weaveworks shutdown (Feb 2024) but project remains strong +- Grafana Labs offers Grafana Cloud integration +- GitOpsCon events + +## Version 3.0 Changes (ArgoCD) + +**Breaking Changes**: +- Annotation-based tracking (default, was labels) +- RBAC logs enforcement (no longer optional) +- Removed legacy metrics (argocd_app_sync_status, etc.) + +**New Features**: +- Fine-grained RBAC (per-resource permissions) +- Better defaults (resource exclusions for high-churn objects) +- Secrets operators endorsement + +## Version 2.7 Changes (Flux) + +**New Features**: +- Image automation GA +- ExternalArtifact and ArtifactGenerator APIs +- Source-watcher component +- OpenTelemetry tracing support +- CEL expressions for readiness + +## Migration Considerations + +### From ArgoCD → Flux + +**Pros**: +- Lower resource consumption +- More modular architecture +- Better OCI support +- Native SOPS integration + +**Cons**: +- Lose web UI +- More complex setup +- Manual multi-tenancy + +**Effort**: Medium-High (2-4 weeks for large deployment) + +### From Flux → ArgoCD + +**Pros**: +- Gain web UI +- Easier multi-tenancy +- ApplicationSets for multi-cluster +- Better for teams new to GitOps + +**Cons**: +- Higher resource consumption +- Less modular +- Limited OCI support + +**Effort**: Medium (1-3 weeks) + +## Recommendations by Use Case + +### Choose ArgoCD if: +- ✅ Developer teams need visibility (UI required) +- ✅ Managing dozens of applications across teams +- ✅ Multi-tenancy with Projects model +- ✅ Fast onboarding is priority +- ✅ Need built-in RBAC with fine-grained control + +### Choose Flux if: +- ✅ Platform engineering focus +- ✅ Infrastructure-as-code emphasis +- ✅ Using OCI artifacts extensively +- ✅ Want modular, composable architecture +- ✅ Team comfortable with CLI tools +- ✅ SOPS+age encryption requirement + +### Use Both if: +- Different teams have different needs +- ArgoCD for app teams, Flux for infrastructure +- Separate concerns (apps vs infrastructure) + +## Cost Considerations + +**ArgoCD**: +- Higher memory/CPU usage (~500MB-1GB per instance) +- Commercial support available (Akuity) + +**Flux**: +- Lower resource footprint (~200-400MB total) +- Grafana Cloud integration available + +## Conclusion + +**2024-2025 Recommendation**: +- **For most organizations**: Start with ArgoCD for ease of use +- **For platform 
teams**: Flux offers more control and modularity +- **For enterprises**: Consider ArgoCD for UI + Flux for infrastructure +- Both are production-ready CNCF Graduated projects + +The choice depends more on team preferences and workflows than technical capability. diff --git a/references/best_practices.md b/references/best_practices.md new file mode 100644 index 0000000..adeb675 --- /dev/null +++ b/references/best_practices.md @@ -0,0 +1,160 @@ +# GitOps Best Practices (2024-2025) + +## CNCF GitOps Principles (OpenGitOps v1.0) + +1. **Declarative**: System desired state expressed declaratively +2. **Versioned**: State stored in version control (Git) +3. **Automated**: Changes automatically applied +4. **Continuous Reconciliation**: Software agents ensure desired state +5. **Auditable**: All changes tracked in Git history + +## Repository Organization + +✅ **DO**: +- Separate infrastructure from applications +- Use clear directory structure (apps/, infrastructure/, clusters/) +- Implement environment promotion (dev → staging → prod) +- Use Kustomize overlays for environment differences + +❌ **DON'T**: +- Commit secrets to Git (use SOPS/Sealed Secrets/ESO) +- Use `:latest` image tags (pin to specific versions) +- Make manual cluster changes (everything through Git) +- Skip testing in lower environments + +## Security Best Practices + +1. **Secrets**: Never plain text, use encryption or external stores +2. **RBAC**: Least privilege for GitOps controllers +3. **Image Security**: Pin to digests, scan for vulnerabilities +4. **Network Policies**: Restrict controller traffic +5. **Audit**: Enable audit logging + +## ArgoCD 3.x Specific + +**Fine-Grained RBAC** (new in 3.0): +```yaml +p, role:dev, applications, *, dev/*, allow +p, role:dev, applications/resources, *, dev/*/Deployment/*, allow +``` + +**Resource Exclusions** (default in 3.0): +- Reduces API load +- Excludes high-churn resources (Endpoints, Leases) + +**Annotation Tracking** (default): +- More reliable than labels +- Auto-migrates on sync + +## Flux 2.7 Specific + +**OCI Artifacts** (GA in 2.6): +- Prefer OCI over Git for generated configs +- Use digest pinning for immutability +- Sign artifacts with cosign/notation + +**Image Automation** (GA in 2.7): +- Automated image updates +- GitRepository write-back + +**Source-Watcher** (new in 2.7): +- Improves reconciliation efficiency +- Enable with: `--components-extra=source-watcher` + +## CI/CD Integration + +**Git Workflow**: +``` +1. Developer commits to feature branch +2. CI runs tests, builds image +3. CI updates Git manifest with new image tag +4. Developer creates PR to main +5. GitOps controller syncs after merge +``` + +**Don't**: Deploy directly from CI to cluster (breaks GitOps) +**Do**: Update Git from CI, let GitOps deploy + +## Monitoring & Observability + +**Track**: +- Sync success rate +- Reconciliation time +- Drift detection frequency +- Failed syncs/reconciliations + +**Tools**: +- Prometheus metrics (both ArgoCD and Flux) +- Grafana dashboards +- Alert on sync failures + +## Image Management + +✅ **Good**: +```yaml +image: myapp:v1.2.3 +image: myapp@sha256:abc123... 
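+# A digest is content-addressed: the referenced image cannot change even if the tag is re-pushed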
+``` + +❌ **Bad**: +```yaml +image: myapp:latest +image: myapp:dev +``` + +**Strategy**: Semantic versioning + digest pinning + +## Environment Promotion + +**Recommended Flow**: +``` +Dev (auto-sync) → Staging (auto-sync) → Production (manual approval) +``` + +**Implementation**: +- Separate directories or repos per environment +- PR-based promotion +- Automated tests before promotion +- Manual approval for production + +## Disaster Recovery + +1. **Git is Source of Truth**: Cluster can be rebuilt from Git +2. **Backup**: Git repo + cluster state +3. **Test Recovery**: Practice cluster rebuild +4. **Document Bootstrap**: How to restore from scratch + +## Performance Optimization + +**ArgoCD**: +- Use ApplicationSets for multi-cluster +- Enable resource exclusions (3.x default) +- Server-side diff for large apps + +**Flux**: +- Use OCI artifacts for large repos +- Enable source-watcher (2.7) +- Tune reconciliation intervals + +## Common Anti-Patterns to Avoid + +1. **Manual kubectl apply**: Bypasses GitOps, creates drift +2. **Multiple sources of truth**: Git should be only source +3. **Secrets in Git**: Always encrypt +4. **Direct cluster modifications**: All changes through Git +5. **No testing**: Always test in dev/staging first +6. **Missing RBAC**: Controllers need minimal permissions + +## 2025 Trends + +✅ **Adopt**: +- OCI artifacts (Flux) +- Workload identity (no static credentials) +- SOPS + age (over PGP) +- External Secrets Operator (dynamic secrets) +- Multi-cluster with ApplicationSets/Flux + +⚠️ **Avoid**: +- Label-based tracking (use annotations - ArgoCD 3.x default) +- PGP encryption (use age) +- Long-lived service account tokens (use workload identity) diff --git a/references/multi_cluster.md b/references/multi_cluster.md new file mode 100644 index 0000000..a900c11 --- /dev/null +++ b/references/multi_cluster.md @@ -0,0 +1,80 @@ +# Multi-Cluster GitOps Management (2024-2025) + +## ArgoCD ApplicationSets + +**Cluster Generator** (auto-discover clusters): +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: my-apps +spec: + generators: + - cluster: + selector: + matchLabels: + environment: production + template: + spec: + source: + repoURL: https://github.com/org/repo + path: apps/{{name}} + destination: + server: '{{server}}' +``` + +**Matrix Generator** (Cluster x Apps): +```yaml +generators: +- matrix: + generators: + - cluster: {} + - git: + directories: + - path: apps/* +``` + +**Performance**: 83% faster than manual (30min → 5min) + +## Flux Multi-Cluster + +**Option 1: Flux Per Cluster** +``` +cluster-1/ → Flux instance 1 +cluster-2/ → Flux instance 2 +``` + +**Option 2: Hub-and-Spoke** +``` +management-cluster/ +└── flux manages → cluster-1, cluster-2 +``` + +**Setup**: +```bash +flux bootstrap github --owner=org --repository=fleet \ + --path=clusters/production --context=prod-cluster +``` + +## Hub-and-Spoke Pattern + +**Benefits**: Centralized management, single source of truth +**Cons**: Single point of failure +**Best for**: < 50 clusters + +## Workload Identity (2025 Best Practice) + +**Instead of service account tokens, use**: +- AWS IRSA +- GCP Workload Identity +- Azure AD Workload Identity + +No more long-lived credentials! + +## Best Practices + +1. **Cluster labeling** for organization +2. **Progressive rollout** (dev → staging → prod clusters) +3. **Separate repos** for cluster config vs apps +4. **Monitor sync status** across all clusters +5. 
**Use workload identity** (no static credentials) diff --git a/references/oci_artifacts.md b/references/oci_artifacts.md new file mode 100644 index 0000000..f3fdf8e --- /dev/null +++ b/references/oci_artifacts.md @@ -0,0 +1,290 @@ +# OCI Artifacts with Flux (2024-2025) + +## Overview + +**GA Status**: Flux v2.6 (June 2025) +**Current**: Fully supported in Flux v2.7 + +OCI artifacts allow storing Kubernetes manifests, Helm charts, and Kustomize overlays in container registries instead of Git. + +## Benefits + +✅ **Decoupled from Git**: No Git dependency for deployment +✅ **Immutable**: Content-addressable by digest +✅ **Standard**: Uses OCI spec, works with any OCI registry +✅ **Signature Verification**: Native support for cosign/notation +✅ **Performance**: Faster than Git for large repos + +## OCIRepository Resource + +```yaml +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OC IRepository +metadata: + name: my-app-oci + namespace: flux-system +spec: + interval: 5m + url: oci://ghcr.io/org/app-config + ref: + tag: v1.0.0 + # or digest: + # digest: sha256:abc123... + # or semver: + # semver: ">=1.0.0 <2.0.0" + provider: generic # or azure, aws, gcp + verify: + provider: cosign + secretRef: + name: cosign-public-key +``` + +## Using with Kustomization + +```yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: my-app +spec: + interval: 10m + sourceRef: + kind: OCIRepository + name: my-app-oci + path: ./ + prune: true +``` + +## Using with HelmRelease + +**OCIRepository for Helm charts**: +```yaml +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: podinfo-oci +spec: + interval: 5m + url: oci://ghcr.io/stefanprodan/charts/podinfo + ref: + semver: ">=6.0.0" +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: podinfo +spec: + chart: + spec: + chart: podinfo + sourceRef: + kind: OCIRepository + name: podinfo-oci +``` + +## Publishing OCI Artifacts + +**Using flux CLI**: +```bash +# Build and push Kustomize overlay +flux push artifact oci://ghcr.io/org/app-config:v1.0.0 \ + --path="./kustomize" \ + --source="$(git config --get remote.origin.url)" \ + --revision="$(git rev-parse HEAD)" + +# Build and push Helm chart +flux push artifact oci://ghcr.io/org/charts/myapp:1.0.0 \ + --path="./charts/myapp" \ + --source="$(git config --get remote.origin.url)" \ + --revision="$(git rev-parse HEAD)" +``` + +## Signature Verification + +### Using cosign + +**Sign artifact**: +```bash +cosign sign ghcr.io/org/app-config:v1.0.0 +``` + +**Verify in Flux**: +```yaml +spec: + verify: + provider: cosign + secretRef: + name: cosign-public-key +``` + +### Using notation + +**Sign artifact**: +```bash +notation sign ghcr.io/org/app-config:v1.0.0 +``` + +**Verify in Flux**: +```yaml +spec: + verify: + provider: notation + secretRef: + name: notation-config +``` + +## Workload Identity + +**Instead of static credentials, use cloud provider workload identity**: + +**AWS IRSA**: +```yaml +spec: + provider: aws + # No credentials needed - uses pod's IAM role +``` + +**GCP Workload Identity**: +```yaml +spec: + provider: gcp + # No credentials needed - uses service account binding +``` + +**Azure Workload Identity**: +```yaml +spec: + provider: azure + # No credentials needed - uses managed identity +``` + +## Best Practices (2025) + +1. **Use digest pinning** for production: + ```yaml + ref: + digest: sha256:abc123... + ``` + +2. **Sign all artifacts**: + ```bash + flux push artifact ... | cosign sign + ``` + +3. 
**Use semver for automated updates**: + ```yaml + ref: + semver: ">=1.0.0 <2.0.0" + ``` + +4. **Leverage workload identity** (no static credentials) + +5. **Prefer OCI for generated configs** (Jsonnet, CUE, Helm output) + +## When to Use OCI vs Git + +**Use OCI Artifacts when**: +- ✅ Storing generated configurations (Jsonnet, CUE output) +- ✅ Need immutable, content-addressable storage +- ✅ Want signature verification +- ✅ Large repos (performance) +- ✅ Decoupling from Git + +**Use Git when**: +- ✅ Source of truth for manifests +- ✅ Need Git workflow (PRs, reviews) +- ✅ Audit trail important +- ✅ Team collaboration + +## Common Pattern: Hybrid Approach + +``` +Git (source of truth) + ↓ +CI builds/generates manifests + ↓ +Push to OCI registry (signed) + ↓ +Flux pulls from OCI (verified) + ↓ +Deploy to cluster +``` + +## Migration from Git to OCI + +**Before (Git)**: +```yaml +apiVersion: source.toolkit.fluxcd.io/v1 +kind: GitRepository +metadata: + name: my-app +spec: + url: https://github.com/org/repo + ref: + branch: main +``` + +**After (OCI)**: +```yaml +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: my-app-oci +spec: + url: oci://ghcr.io/org/app-config + ref: + tag: v1.0.0 +``` + +**Update Kustomization/HelmRelease** sourceRef to point to OCIRepository + +## Supported Registries + +- ✅ GitHub Container Registry (ghcr.io) +- ✅ Docker Hub +- ✅ AWS ECR +- ✅ Google Artifact Registry +- ✅ Azure Container Registry +- ✅ Harbor +- ✅ GitLab Container Registry + +## Troubleshooting + +**Artifact not found**: +```bash +flux get sources oci +kubectl describe ocirepository + +# Verify artifact exists +crane digest ghcr.io/org/app:v1.0.0 +``` + +**Authentication failures**: +```bash +# Check secret +kubectl get secret -n flux-system + +# Test manually +crane manifest ghcr.io/org/app:v1.0.0 +``` + +**Signature verification fails**: +```bash +# Verify locally +cosign verify ghcr.io/org/app:v1.0.0 + +# Check public key secret +kubectl get secret cosign-public-key -o yaml +``` + +## 2025 Recommendation + +**Adopt OCI artifacts** for: +- Helm charts (already standard) +- Generated manifests (CI output) +- Multi-environment configs + +**Keep Git for**: +- Source manifests +- Infrastructure definitions +- Team collaboration workflows diff --git a/references/progressive_delivery.md b/references/progressive_delivery.md new file mode 100644 index 0000000..e1913cd --- /dev/null +++ b/references/progressive_delivery.md @@ -0,0 +1,94 @@ +# Progressive Delivery with GitOps (2024-2025) + +## Argo Rollouts (with ArgoCD) + +**Current Focus**: Kubernetes-native progressive delivery + +**Deployment Strategies**: + +### 1. Canary +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: my-app +spec: + strategy: + canary: + steps: + - setWeight: 20 + - pause: {duration: 5m} + - setWeight: 50 + - pause: {duration: 5m} + - setWeight: 100 +``` + +### 2. Blue-Green +```yaml +spec: + strategy: + blueGreen: + activeService: my-app + previewService: my-app-preview + autoPromotionEnabled: false +``` + +### 3. 
Analysis with Metrics +```yaml +spec: + strategy: + canary: + analysis: + templates: + - templateName: success-rate + args: + - name: service-name + value: my-app +``` + +**Metric Providers**: Prometheus, Datadog, New Relic, CloudWatch + +## Flagger (with Flux) + +**Installation**: +```bash +flux install +kubectl apply -k github.com/fluxcd/flagger//kustomize/linkerd +``` + +**Canary with Flagger**: +```yaml +apiVersion: flagger.app/v1beta1 +kind: Canary +metadata: + name: my-app +spec: + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: my-app + service: + port: 9898 + analysis: + interval: 1m + threshold: 5 + maxWeight: 50 + stepWeight: 10 + metrics: + - name: request-success-rate + thresholdRange: + min: 99 +``` + +## Best Practices + +1. **Start with Manual Approval** (autoPromotionEnabled: false) +2. **Monitor Key Metrics** (error rate, latency, saturation) +3. **Set Conservative Steps** (10%, 25%, 50%, 100%) +4. **Define Rollback Criteria** (error rate > 1%) +5. **Test in Staging First** + +## 2025 Recommendation + +**For ArgoCD users**: Argo Rollouts (tight integration, UI support) +**For Flux users**: Flagger (CNCF project, modular design) diff --git a/references/repo_patterns.md b/references/repo_patterns.md new file mode 100644 index 0000000..878682e --- /dev/null +++ b/references/repo_patterns.md @@ -0,0 +1,184 @@ +# GitOps Repository Patterns (2024-2025) + +## Monorepo vs Polyrepo + +### Monorepo Pattern + +**Structure**: +``` +gitops-repo/ +├── apps/ +│ ├── frontend/ +│ ├── backend/ +│ └── database/ +├── infrastructure/ +│ ├── ingress/ +│ ├── monitoring/ +│ └── secrets/ +└── clusters/ + ├── dev/ + ├── staging/ + └── production/ +``` + +**Pros**: +- Single source of truth +- Atomic changes across apps +- Easier to start with +- Simpler CI/CD + +**Cons**: +- Scaling issues (>100 apps) +- RBAC complexity +- Large repo size +- Blast radius concerns + +**Best for**: Startups, small teams (< 20 apps), single team ownership + +### Polyrepo Pattern + +**Structure**: +``` +infrastructure-repo/ (Platform team) +app-team-1-repo/ (Team 1) +app-team-2-repo/ (Team 2) +cluster-config-repo/ (Platform team) +``` + +**Pros**: +- Clear ownership boundaries +- Better RBAC (repo-level) +- Scales to 100s of apps +- Team autonomy + +**Cons**: +- More complex setup +- Cross-repo dependencies +- Multiple CI/CD pipelines + +**Best for**: Large orgs, multiple teams, clear separation of concerns + +## Common Patterns + +### 1. Repo Per Team +- Each team has own repo +- Platform team manages infra repo +- Hub cluster manages all + +### 2. Repo Per App +- Each app in separate repo +- Good for microservices +- Maximum autonomy + +### 3. 
Hybrid (Recommended) +- Infrastructure monorepo (platform team) +- Application polyrepo (dev teams) +- Best of both worlds + +## App-of-Apps Pattern (ArgoCD) + +**Root Application**: +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: root +spec: + source: + repoURL: https://github.com/org/gitops + path: apps/ + destination: + server: https://kubernetes.default.svc +``` + +**Apps Directory**: +``` +apps/ +├── app1.yaml (Application manifest) +├── app2.yaml +└── app3.yaml +``` + +**Benefits**: Centralized management, single sync point + +## Environment Structure + +### Option 1: Directory Per Environment +``` +apps/ +├── base/ +│ └── kustomization.yaml +└── overlays/ + ├── dev/ + ├── staging/ + └── production/ +``` + +### Option 2: Branch Per Environment +``` +main branch → production +staging branch → staging +dev branch → development +``` + +**Don't Repeat YAML**: Use Kustomize bases + overlays + +## Flux Repository Organization + +**Recommended Structure**: +``` +flux-repo/ +├── clusters/ +│ ├── production/ +│ │ ├── flux-system/ +│ │ ├── apps.yaml +│ │ └── infrastructure.yaml +│ └── staging/ +├── apps/ +│ └── podinfo/ +│ ├── kustomization.yaml +│ └── release.yaml +└── infrastructure/ + └── sources/ + ├── gitrepositories.yaml + └── ocirepositories.yaml +``` + +## Kustomize vs Helm in GitOps + +**Kustomize** (recommended for GitOps): +- Native Kubernetes +- Declarative patches +- No templating language + +**Helm** (when necessary): +- Third-party charts +- Complex applications +- Need parameterization + +**Best Practice**: Kustomize for your apps, Helm for third-party + +## Promotion Strategies + +### 1. Manual PR-based +``` +dev/ → (PR) → staging/ → (PR) → production/ +``` + +### 2. Automated with CI +``` +dev/ → (auto-promote on tests pass) → staging/ → (manual approval) → production/ +``` + +### 3. Progressive with Canary +``` +production/stable/ → canary deployment → production/all/ +``` + +## 2024-2025 Recommendations + +1. **Start with monorepo**, migrate to polyrepo when needed +2. **Use Kustomize bases + overlays** (don't repeat YAML) +3. **Separate infrastructure from applications** +4. **Implement promotion workflows** (dev → staging → prod) +5. **Never commit directly to production** (always PR) diff --git a/references/secret_management.md b/references/secret_management.md new file mode 100644 index 0000000..fd8776e --- /dev/null +++ b/references/secret_management.md @@ -0,0 +1,213 @@ +# Secrets Management in GitOps (2024-2025) + +## Overview + +**Never commit plain secrets to Git.** Use encryption or external secret stores. + +## Solutions Comparison + +| Solution | Type | Complexity | Best For | 2025 Trend | +|----------|------|------------|----------|------------| +| **Sealed Secrets** | Encrypted in Git | Low | Simple, GitOps-first | Stable | +| **External Secrets Operator** | External store | Medium | Cloud-native, dynamic | ↗️ Growing | +| **SOPS + age** | Encrypted in Git | Medium | Flexible, Git-friendly | ↗️ Preferred over PGP | + +## 1. 
Sealed Secrets + +**How it works**: Public key encryption, controller decrypts in-cluster + +**Setup**: +```bash +kubectl apply -f https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.24.0/controller.yaml +``` + +**Usage**: +```bash +# Create sealed secret +kubectl create secret generic my-secret --dry-run=client -o yaml --from-literal=password=supersecret | \ + kubeseal -o yaml > sealed-secret.yaml + +# Commit to Git +git add sealed-secret.yaml +git commit -m "Add sealed secret" +``` + +**Pros**: Simple, GitOps-native, no external dependencies +**Cons**: Key rotation complexity, static secrets only + +## 2. External Secrets Operator (ESO) + +**Latest Version**: v0.20.2 (2024-2025) + +**Supported Providers**: +- AWS Secrets Manager +- Azure Key Vault +- Google Secret Manager +- HashiCorp Vault +- 1Password +- Doppler + +**Setup**: +```bash +helm install external-secrets external-secrets/external-secrets -n external-secrets-system --create-namespace +``` + +**Usage**: +```yaml +apiVersion: external-secrets.io/v1beta1 +kind: SecretStore +metadata: + name: aws-secret-store +spec: + provider: + aws: + service: SecretsManager + region: us-east-1 + auth: + jwt: + serviceAccountRef: + name: external-secrets-sa +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: my-secret +spec: + secretStoreRef: + name: aws-secret-store + target: + name: my-app-secret + data: + - secretKey: password + remoteRef: + key: prod/my-app/password +``` + +**Pros**: Dynamic secrets, cloud-native, automatic rotation +**Cons**: External dependency, requires cloud secret store + +**2025 Recommendation**: Growing preference over Sealed Secrets + +## 3. SOPS + age + +**Recommended over PGP as of 2024-2025** + +**Setup age**: +```bash +# Install age +brew install age # macOS +apt install age # Ubuntu + +# Generate key +age-keygen -o key.txt +# Public key: age1... +``` + +**Setup SOPS**: +```bash +# Install SOPS +brew install sops + +# Create .sops.yaml +cat < .sops.yaml +creation_rules: + - path_regex: .*.yaml + encrypted_regex: ^(data|stringData)$ + age: age1ql3z7hjy54pw3hyww5ayyfg7zqgvc7w3j2elw8zmrj2kg5sfn9aqmcac8p +EOF +``` + +**Encrypt secrets**: +```bash +# Create secret +kubectl create secret generic my-secret --dry-run=client -o yaml --from-literal=password=supersecret > secret.yaml + +# Encrypt with SOPS +sops -e secret.yaml > secret.enc.yaml + +# Commit encrypted version +git add secret.enc.yaml .sops.yaml +``` + +**Flux Integration**: +```yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: app +spec: + decryption: + provider: sops + secretRef: + name: sops-age +``` + +**Pros**: Git-friendly, flexible, age is simpler than PGP +**Cons**: Manual encryption step, key management + +## Best Practices (2024-2025) + +### 1. Key Rotation +**Sealed Secrets**: Rotate annually, maintain old keys for decryption +**ESO**: Automatic with cloud providers +**SOPS**: Re-encrypt when rotating age keys + +### 2. Access Control +- Never commit `.sops` age key to Git +- Use separate keys per environment +- Store age keys in CI/CD secrets +- Use RBAC for Secret access + +### 3. Encryption Scope +**SOPS .sops.yaml**: +```yaml +creation_rules: + - path_regex: production/.* + encrypted_regex: ^(data|stringData)$ + age: age1prod... + - path_regex: staging/.* + encrypted_regex: ^(data|stringData)$ + age: age1staging... +``` + +### 4. 
Git Pre-commit Hook +Prevent committing plain secrets: +```bash +#!/bin/bash +# .git/hooks/pre-commit +if git diff --cached --name-only | grep -E 'secret.*\.yaml$'; then + echo "⚠️ Potential secret file detected" + echo "Ensure it's encrypted with SOPS" + exit 1 +fi +``` + +### 5. ArgoCD 3.0 Recommendation +**Use secrets operators** (ESO preferred), avoid config management plugins for secrets + +## Decision Guide + +**Choose Sealed Secrets if**: +- ✅ Simple GitOps workflow +- ✅ Static secrets +- ✅ No external dependencies wanted +- ✅ Small team + +**Choose External Secrets Operator if**: +- ✅ Already using cloud secret stores +- ✅ Need secret rotation +- ✅ Dynamic secrets +- ✅ Enterprise compliance + +**Choose SOPS + age if**: +- ✅ Git-centric workflow +- ✅ Want flexibility +- ✅ Multi-cloud +- ✅ Prefer open standards + +## 2025 Trend Summary + +**Growing**: External Secrets Operator, SOPS+age +**Stable**: Sealed Secrets (still widely used) +**Declining**: PGP encryption (age preferred) +**Emerging**: age encryption as standard (simpler than PGP) diff --git a/references/troubleshooting.md b/references/troubleshooting.md new file mode 100644 index 0000000..db97973 --- /dev/null +++ b/references/troubleshooting.md @@ -0,0 +1,134 @@ +# GitOps Troubleshooting Guide (2024-2025) + +## Common ArgoCD Issues + +### 1. Application OutOfSync +**Symptoms**: Application shows OutOfSync status +**Causes**: Git changes not applied, manual cluster changes +**Fix**: +```bash +argocd app sync my-app +argocd app diff my-app # See differences +``` + +### 2. Annotation Tracking Migration (ArgoCD 3.x) +**Symptoms**: Resources not tracked after upgrade to 3.x +**Cause**: Default changed from labels to annotations +**Fix**: Resources auto-migrate on next sync, or force: +```bash +argocd app sync my-app --force +``` + +### 3. Sync Fails with "Resource is Invalid" +**Cause**: YAML validation error, CRD mismatch +**Fix**: +```bash +argocd app get my-app --show-operation +kubectl apply --dry-run=client -f manifest.yaml # Test locally +``` + +### 4. Image Pull Errors +**Cause**: Registry credentials, network issues +**Fix**: +```bash +kubectl get events -n +kubectl describe pod -n +# Check image pull secret +kubectl get secret -n +``` + +## Common Flux Issues + +### 1. GitRepository Not Ready +**Symptoms**: source not ready, no artifact +**Causes**: Auth failure, branch doesn't exist +**Fix**: +```bash +flux get sources git +flux reconcile source git -n flux-system +kubectl describe gitrepository -n flux-system +``` + +### 2. Kustomization Build Failed +**Cause**: Invalid kustomization.yaml, missing resources +**Fix**: +```bash +flux get kustomizations +kubectl describe kustomization -n flux-system +# Test locally +kustomize build +``` + +### 3. HelmRelease Install Failed +**Cause**: Values error, chart incompatibility +**Fix**: +```bash +flux get helmreleases +kubectl logs -n flux-system -l app=helm-controller +# Test locally +helm template -f values.yaml +``` + +### 4. 
OCI Repository Issues (Flux 2.6+) +**Cause**: Registry auth, OCI artifact not found +**Fix**: +```bash +flux get sources oci +kubectl describe ocirepository +# Verify artifact exists +crane digest ghcr.io/org/app:v1.0.0 +``` + +## SOPS Decryption Failures + +**Symptom**: Secret not decrypted +**Fix**: +```bash +# Check age secret exists +kubectl get secret sops-age -n flux-system + +# Test decryption locally +export SOPS_AGE_KEY_FILE=key.txt +sops -d secret.enc.yaml +``` + +## Performance Issues + +### ArgoCD Slow Syncs +**Cause**: Too many resources, inefficient queries +**Fix** (ArgoCD 3.x): +- Use default resource exclusions +- Enable server-side diff +- Increase controller replicas + +### Flux Slow Reconciliation +**Cause**: Large monorepos, many sources +**Fix** (Flux 2.7+): +- Enable source-watcher +- Increase interval +- Use OCI artifacts instead of Git + +## Debugging Commands + +**ArgoCD**: +```bash +argocd app get --refresh +argocd app logs +kubectl logs -n argocd -l app.kubernetes.io/name=argocd-application-controller +``` + +**Flux**: +```bash +flux logs --all-namespaces +flux check +flux get all +kubectl -n flux-system get events --sort-by='.lastTimestamp' +``` + +## Quick Wins + +1. **Use `--dry-run`** before applying +2. **Check controller logs** first +3. **Verify RBAC** permissions +4. **Test manifests locally** (kubectl apply --dry-run, kustomize build) +5. **Check Git connectivity** (credentials, network) diff --git a/scripts/applicationset_generator.py b/scripts/applicationset_generator.py new file mode 100644 index 0000000..73557b6 --- /dev/null +++ b/scripts/applicationset_generator.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Generate ArgoCD ApplicationSet manifests for multi-cluster deployments. +Supports Cluster, List, and Matrix generators (ArgoCD 3.x). 
+""" + +import argparse +import sys +import yaml + + +APPLICATIONSET_TEMPLATE = """--- +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: {name} + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: +{generators} + template: + metadata: + name: '{{{{.name}}}}-{name}' + labels: + environment: '{{{{.environment}}}}' + spec: + project: default + source: + repoURL: {repo_url} + targetRevision: {target_revision} + path: '{path}' + destination: + server: '{{{{.server}}}}' + namespace: {namespace} + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true +""" + + +def generate_cluster_generator(label_selector: str = "") -> str: + """Generate Cluster generator.""" + selector = f"\n selector:\n matchLabels:\n {label_selector}" if label_selector else "" + return f""" - cluster: {{{selector}}}""" + + +def generate_list_generator(clusters: list) -> str: + """Generate List generator.""" + elements = "\n".join([f" - name: {c['name']}\n server: {c['server']}\n environment: {c.get('environment', 'production')}" + for c in clusters]) + return f""" - list: + elements: +{elements}""" + + +def generate_matrix_generator(cluster_label: str, git_directories: list) -> str: + """Generate Matrix generator (Cluster x Git directories).""" + git_list = "\n".join([f" - path: {d}" for d in git_directories]) + return f""" - matrix: + generators: + - cluster: + selector: + matchLabels: + environment: {cluster_label} + - git: + repoURL: https://github.com/example/apps + revision: HEAD + directories: +{git_list}""" + + +def main(): + parser = argparse.ArgumentParser( + description='Generate ArgoCD ApplicationSet manifests', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Cluster generator (all clusters) + python3 applicationset_generator.py cluster \\ + --name my-apps \\ + --repo-url https://github.com/org/repo \\ + --path apps/ + + # List generator (specific clusters) + python3 applicationset_generator.py list \\ + --name my-apps \\ + --clusters prod=https://prod.k8s.local,staging=https://staging.k8s.local + + # Matrix generator (cluster x directories) + python3 applicationset_generator.py matrix \\ + --name my-apps \\ + --cluster-label production \\ + --directories app1,app2,app3 + """ + ) + + parser.add_argument('generator_type', choices=['cluster', 'list', 'matrix'], + help='Generator type') + parser.add_argument('--name', required=True, help='ApplicationSet name') + parser.add_argument('--repo-url', default='https://github.com/example/repo', + help='Git repository URL') + parser.add_argument('--path', default='apps/', help='Path in repository') + parser.add_argument('--namespace', default='default', help='Target namespace') + parser.add_argument('--target-revision', default='main', help='Git branch/tag') + parser.add_argument('--cluster-label', help='Cluster label selector') + parser.add_argument('--clusters', help='Cluster list (name=server,name=server)') + parser.add_argument('--directories', help='Git directories (comma-separated)') + parser.add_argument('--output', help='Output file') + + args = parser.parse_args() + + # Generate based on type + if args.generator_type == 'cluster': + generators = generate_cluster_generator(args.cluster_label or "") + elif args.generator_type == 'list': + if not args.clusters: + print("❌ --clusters required for list generator") + sys.exit(1) + cluster_list = [] + for c in args.clusters.split(','): + name, server = c.split('=') + 
cluster_list.append({'name': name, 'server': server}) + generators = generate_list_generator(cluster_list) + elif args.generator_type == 'matrix': + if not args.cluster_label or not args.directories: + print("❌ --cluster-label and --directories required for matrix generator") + sys.exit(1) + directories = args.directories.split(',') + generators = generate_matrix_generator(args.cluster_label, directories) + + # Create ApplicationSet + appset = APPLICATIONSET_TEMPLATE.format( + name=args.name, + generators=generators, + repo_url=args.repo_url, + target_revision=args.target_revision, + path=args.path, + namespace=args.namespace + ) + + # Output + if args.output: + with open(args.output, 'w') as f: + f.write(appset) + print(f"✅ ApplicationSet written to: {args.output}") + else: + print(appset) + + +if __name__ == '__main__': + main() diff --git a/scripts/check_argocd_health.py b/scripts/check_argocd_health.py new file mode 100644 index 0000000..449d60c --- /dev/null +++ b/scripts/check_argocd_health.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +Check ArgoCD application health and diagnose sync issues. +Supports ArgoCD 3.x API with annotation-based tracking. +""" + +import argparse +import sys +import json +from typing import Dict, List, Any, Optional +from datetime import datetime + +try: + import requests +except ImportError: + print("⚠️ Warning: 'requests' library not found. Install with: pip install requests") + sys.exit(1) + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class ArgoCDHealthChecker: + def __init__(self, server: str, token: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None): + self.server = server.rstrip('/') + self.token = token + self.session = requests.Session() + + if token: + self.session.headers['Authorization'] = f'Bearer {token}' + elif username and password: + # Login to get token + self._login(username, password) + else: + raise ValueError("Either --token or --username/--password must be provided") + + def _login(self, username: str, password: str): + """Login to ArgoCD and get auth token.""" + try: + response = self.session.post( + f"{self.server}/api/v1/session", + json={"username": username, "password": password}, + verify=False + ) + response.raise_for_status() + self.token = response.json()['token'] + self.session.headers['Authorization'] = f'Bearer {self.token}' + except Exception as e: + print(f"❌ Failed to login to ArgoCD: {e}") + sys.exit(1) + + def get_applications(self, name: Optional[str] = None) -> List[Dict]: + """Get ArgoCD applications.""" + try: + if name: + url = f"{self.server}/api/v1/applications/{name}" + response = self.session.get(url, verify=False) + response.raise_for_status() + return [response.json()] + else: + url = f"{self.server}/api/v1/applications" + response = self.session.get(url, verify=False) + response.raise_for_status() + return response.json().get('items', []) + except Exception as e: + print(f"❌ Failed to get applications: {e}") + return [] + + def check_application_health(self, app: Dict) -> Dict[str, Any]: + """Check application health and sync status.""" + name = app['metadata']['name'] + health = app.get('status', {}).get('health', {}) + sync = app.get('status', {}).get('sync', {}) + operation_state = app.get('status', {}).get('operationState', {}) + + result = { + 'name': name, + 'health_status': health.get('status', 'Unknown'), + 'health_message': health.get('message', ''), + 'sync_status': sync.get('status', 'Unknown'), + 'sync_revision': 
sync.get('revision', 'N/A')[:8] if sync.get('revision') else 'N/A', + 'operation_phase': operation_state.get('phase', 'N/A'), + 'issues': [], + 'recommendations': [] + } + + # Check for common issues + if result['health_status'] not in ['Healthy', 'Unknown']: + result['issues'].append(f"Application is {result['health_status']}") + if result['health_message']: + result['issues'].append(f"Health message: {result['health_message']}") + + if result['sync_status'] == 'OutOfSync': + result['issues'].append("Application is out of sync with Git") + result['recommendations'].append("Run: argocd app sync " + name) + result['recommendations'].append("Check if manual sync is required (sync policy)") + + if result['sync_status'] == 'Unknown': + result['issues'].append("Sync status is unknown") + result['recommendations'].append("Check ArgoCD application controller logs") + result['recommendations'].append(f"kubectl logs -n argocd -l app.kubernetes.io/name=argocd-application-controller") + + # Check for failed operations + if operation_state.get('phase') == 'Failed': + result['issues'].append(f"Last operation failed") + if 'message' in operation_state: + result['issues'].append(f"Operation message: {operation_state['message']}") + result['recommendations'].append("Check operation details in ArgoCD UI") + result['recommendations'].append(f"argocd app get {name}") + + # Check resource conditions (ArgoCD 3.x) + resources = app.get('status', {}).get('resources', []) + unhealthy_resources = [r for r in resources if r.get('health', {}).get('status') not in ['Healthy', 'Unknown', '']] + if unhealthy_resources: + result['issues'].append(f"{len(unhealthy_resources)} resources are unhealthy") + for r in unhealthy_resources[:3]: # Show first 3 + kind = r.get('kind', 'Unknown') + name = r.get('name', 'Unknown') + status = r.get('health', {}).get('status', 'Unknown') + result['issues'].append(f" - {kind}/{name}: {status}") + result['recommendations'].append(f"kubectl get {unhealthy_resources[0]['kind']} -n {app['spec']['destination']['namespace']}") + + # Check for annotation-based tracking (ArgoCD 3.x default) + tracking_method = app.get('spec', {}).get('syncPolicy', {}).get('syncOptions', []) + has_label_tracking = 'UseLabel=true' in tracking_method + if has_label_tracking: + result['recommendations'].append("⚠️ Using legacy label-based tracking. 
Consider migrating to annotation-based tracking (ArgoCD 3.x default)") + + return result + + def check_all_applications(self, name: Optional[str] = None, show_healthy: bool = False) -> List[Dict]: + """Check all applications or specific application.""" + apps = self.get_applications(name) + results = [] + + for app in apps: + result = self.check_application_health(app) + if show_healthy or result['issues']: + results.append(result) + + return results + + def print_summary(self, results: List[Dict]): + """Print summary of application health.""" + if not results: + print("✅ No applications found or all healthy (use --show-healthy to see healthy apps)") + return + + # Summary statistics + total = len(results) + with_issues = len([r for r in results if r['issues']]) + + print(f"\n📊 Summary: {with_issues}/{total} applications have issues\n") + + # Table output + if tabulate: + table_data = [] + for r in results: + status_icon = "❌" if r['issues'] else "✅" + table_data.append([ + status_icon, + r['name'], + r['health_status'], + r['sync_status'], + r['sync_revision'], + len(r['issues']) + ]) + + print(tabulate( + table_data, + headers=['', 'Application', 'Health', 'Sync', 'Revision', 'Issues'], + tablefmt='simple' + )) + else: + for r in results: + status_icon = "❌" if r['issues'] else "✅" + print(f"{status_icon} {r['name']}: Health={r['health_status']}, Sync={r['sync_status']}, Issues={len(r['issues'])}") + + # Detailed issues and recommendations + print("\n🔍 Detailed Issues:\n") + for r in results: + if not r['issues']: + continue + + print(f"Application: {r['name']}") + print(f" Health: {r['health_status']}") + print(f" Sync: {r['sync_status']}") + + if r['issues']: + print(" Issues:") + for issue in r['issues']: + print(f" • {issue}") + + if r['recommendations']: + print(" Recommendations:") + for rec in r['recommendations']: + print(f" → {rec}") + print() + + +def main(): + parser = argparse.ArgumentParser( + description='Check ArgoCD application health and diagnose sync issues (ArgoCD 3.x compatible)', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check all applications + python3 check_argocd_health.py \\ + --server https://argocd.example.com \\ + --token $ARGOCD_TOKEN + + # Check specific application + python3 check_argocd_health.py \\ + --server https://argocd.example.com \\ + --username admin \\ + --password $ARGOCD_PASSWORD \\ + --app my-app + + # Show all applications including healthy ones + python3 check_argocd_health.py \\ + --server https://argocd.example.com \\ + --token $ARGOCD_TOKEN \\ + --show-healthy + +ArgoCD 3.x Features: + - Annotation-based tracking (default) + - Fine-grained RBAC support + - Enhanced resource health checks + """ + ) + + parser.add_argument('--server', required=True, help='ArgoCD server URL') + parser.add_argument('--token', help='ArgoCD auth token (or set ARGOCD_TOKEN env var)') + parser.add_argument('--username', help='ArgoCD username') + parser.add_argument('--password', help='ArgoCD password') + parser.add_argument('--app', help='Specific application name to check') + parser.add_argument('--show-healthy', action='store_true', help='Show healthy applications') + parser.add_argument('--json', action='store_true', help='Output as JSON') + + args = parser.parse_args() + + # Get token from env if not provided + import os + token = args.token or os.getenv('ARGOCD_TOKEN') + + try: + checker = ArgoCDHealthChecker( + server=args.server, + token=token, + username=args.username, + password=args.password + ) + + results = 
checker.check_all_applications( + name=args.app, + show_healthy=args.show_healthy + ) + + if args.json: + print(json.dumps(results, indent=2)) + else: + checker.print_summary(results) + + except KeyboardInterrupt: + print("\n\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/check_flux_health.py b/scripts/check_flux_health.py new file mode 100644 index 0000000..36a318d --- /dev/null +++ b/scripts/check_flux_health.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python3 +""" +Check Flux CD health and diagnose reconciliation issues. +Supports Flux v2.7+ with OCI artifacts, image automation, and source-watcher. +""" + +import argparse +import sys +import json +from typing import Dict, List, Any, Optional +from datetime import datetime, timedelta + +try: + from kubernetes import client, config + from kubernetes.client.rest import ApiException +except ImportError: + print("⚠️ Warning: 'kubernetes' library not found. Install with: pip install kubernetes") + sys.exit(1) + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +class FluxHealthChecker: + def __init__(self, namespace: str = "flux-system", kubeconfig: Optional[str] = None): + self.namespace = namespace + + # Load kubeconfig + try: + if kubeconfig: + config.load_kube_config(config_file=kubeconfig) + else: + try: + config.load_kube_config() + except: + config.load_incluster_config() + except Exception as e: + print(f"❌ Failed to load kubeconfig: {e}") + sys.exit(1) + + self.api = client.ApiClient() + self.custom_api = client.CustomObjectsApi(self.api) + self.core_api = client.CoreV1Api(self.api) + + def get_flux_resources(self, resource_type: str, namespace: Optional[str] = None) -> List[Dict]: + """Get Flux custom resources.""" + ns = namespace or self.namespace + + resource_map = { + 'gitrepositories': ('source.toolkit.fluxcd.io', 'v1', 'gitrepositories'), + 'ocirepositories': ('source.toolkit.fluxcd.io', 'v1beta2', 'ocirepositories'), + 'helmrepositories': ('source.toolkit.fluxcd.io', 'v1', 'helmrepositories'), + 'buckets': ('source.toolkit.fluxcd.io', 'v1beta2', 'buckets'), + 'kustomizations': ('kustomize.toolkit.fluxcd.io', 'v1', 'kustomizations'), + 'helmreleases': ('helm.toolkit.fluxcd.io', 'v2', 'helmreleases'), + 'imageupdateautomations': ('image.toolkit.fluxcd.io', 'v1beta2', 'imageupdateautomations'), + 'imagerepositories': ('image.toolkit.fluxcd.io', 'v1beta2', 'imagerepositories'), + } + + if resource_type not in resource_map: + return [] + + group, version, plural = resource_map[resource_type] + + try: + response = self.custom_api.list_namespaced_custom_object( + group=group, + version=version, + namespace=ns, + plural=plural + ) + return response.get('items', []) + except ApiException as e: + if e.status == 404: + return [] + print(f"⚠️ Warning: Failed to get {resource_type}: {e}") + return [] + + def check_resource_health(self, resource: Dict, resource_type: str) -> Dict[str, Any]: + """Check resource health and reconciliation status.""" + name = resource['metadata']['name'] + namespace = resource['metadata']['namespace'] + status = resource.get('status', {}) + + # Get conditions + conditions = status.get('conditions', []) + ready_condition = next((c for c in conditions if c['type'] == 'Ready'), None) + + result = { + 'type': resource_type, + 'name': name, + 'namespace': namespace, + 'ready': ready_condition.get('status', 'Unknown') if ready_condition else 'Unknown', + 'message': 
ready_condition.get('message', '') if ready_condition else '', + 'last_reconcile': status.get('lastHandledReconcileAt', 'N/A'), + 'issues': [], + 'recommendations': [] + } + + # Check if ready + if result['ready'] != 'True': + result['issues'].append(f"{resource_type} is not ready") + if result['message']: + result['issues'].append(f"Message: {result['message']}") + + # Type-specific checks + if resource_type == 'gitrepositories': + self._check_git_repository(resource, result) + elif resource_type == 'ocirepositories': + self._check_oci_repository(resource, result) + elif resource_type == 'kustomizations': + self._check_kustomization(resource, result) + elif resource_type == 'helmreleases': + self._check_helm_release(resource, result) + elif resource_type == 'imageupdateautomations': + self._check_image_automation(resource, result) + + return result + + def _check_git_repository(self, resource: Dict, result: Dict): + """Check GitRepository-specific issues.""" + status = resource.get('status', {}) + + # Check artifact + if not status.get('artifact'): + result['issues'].append("No artifact available") + result['recommendations'].append("Check repository URL and credentials") + result['recommendations'].append(f"flux reconcile source git {result['name']} -n {result['namespace']}") + + # Check for auth errors + if 'authentication' in result['message'].lower() or 'credentials' in result['message'].lower(): + result['recommendations'].append("Check Git credentials secret") + result['recommendations'].append(f"kubectl get secret -n {result['namespace']}") + + def _check_oci_repository(self, resource: Dict, result: Dict): + """Check OCIRepository-specific issues (Flux v2.6+ feature).""" + status = resource.get('status', {}) + + # Check artifact + if not status.get('artifact'): + result['issues'].append("No OCI artifact available") + result['recommendations'].append("Check OCI repository URL and credentials") + result['recommendations'].append("Verify OCI artifact exists in registry") + + # Check signature verification (Flux v2.7+) + spec = resource.get('spec', {}) + if spec.get('verify'): + verify_status = status.get('observedGeneration') + if not verify_status: + result['issues'].append("Signature verification configured but not completed") + result['recommendations'].append("Check cosign or notation configuration") + + def _check_kustomization(self, resource: Dict, result: Dict): + """Check Kustomization-specific issues.""" + status = resource.get('status', {}) + + # Check source reference + spec = resource.get('spec', {}) + source_ref = spec.get('sourceRef', {}) + if not source_ref: + result['issues'].append("No source reference configured") + + # Check inventory + inventory = status.get('inventory') + if inventory and 'entries' in inventory: + total_resources = len(inventory['entries']) + result['recommendations'].append(f"Managing {total_resources} resources") + + # Check for prune errors + if 'prune' in result['message'].lower(): + result['recommendations'].append("Check for resources blocking pruning") + result['recommendations'].append("Review finalizers on deleted resources") + + def _check_helm_release(self, resource: Dict, result: Dict): + """Check HelmRelease-specific issues.""" + status = resource.get('status', {}) + + # Check install/upgrade status + install_failures = status.get('installFailures', 0) + upgrade_failures = status.get('upgradeFailures', 0) + + if install_failures > 0: + result['issues'].append(f"Install failed {install_failures} times") + 
result['recommendations'].append("Check Helm values and chart compatibility") + + if upgrade_failures > 0: + result['issues'].append(f"Upgrade failed {upgrade_failures} times") + result['recommendations'].append("Review Helm upgrade logs") + result['recommendations'].append(f"kubectl logs -n {result['namespace']} -l app=helm-controller") + + # Check for timeout issues + if 'timeout' in result['message'].lower(): + result['recommendations'].append("Increase timeout in HelmRelease spec") + result['recommendations'].append("Check pod startup times and readiness probes") + + def _check_image_automation(self, resource: Dict, result: Dict): + """Check ImageUpdateAutomation-specific issues (Flux v2.7+ GA).""" + status = resource.get('status', {}) + + # Check last automation time + last_automation = status.get('lastAutomationRunTime') + if not last_automation: + result['issues'].append("No automation runs recorded") + result['recommendations'].append("Check ImagePolicy and git write access") + + def check_flux_controllers(self) -> List[Dict]: + """Check health of Flux controller pods.""" + results = [] + + controller_labels = [ + 'source-controller', + 'kustomize-controller', + 'helm-controller', + 'notification-controller', + 'image-reflector-controller', + 'image-automation-controller', + ] + + for controller in controller_labels: + try: + pods = self.core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=f'app={controller}' + ) + + if not pods.items: + results.append({ + 'controller': controller, + 'status': 'Not Found', + 'issues': [f'{controller} not found'], + 'recommendations': ['Check Flux installation'] + }) + continue + + pod = pods.items[0] + pod_status = pod.status.phase + + result = { + 'controller': controller, + 'status': pod_status, + 'issues': [], + 'recommendations': [] + } + + if pod_status != 'Running': + result['issues'].append(f'Controller not running (status: {pod_status})') + result['recommendations'].append(f'kubectl describe pod -n {self.namespace} -l app={controller}') + result['recommendations'].append(f'kubectl logs -n {self.namespace} -l app={controller}') + + # Check container restarts + for container_status in pod.status.container_statuses or []: + if container_status.restart_count > 5: + result['issues'].append(f'High restart count: {container_status.restart_count}') + result['recommendations'].append('Check controller logs for crash loops') + + results.append(result) + + except ApiException as e: + results.append({ + 'controller': controller, + 'status': 'Error', + 'issues': [f'Failed to check: {e}'], + 'recommendations': [] + }) + + return results + + def print_summary(self, resource_results: List[Dict], controller_results: List[Dict]): + """Print summary of Flux health.""" + # Controller health + print("\n🎛️ Flux Controllers:\n") + + if tabulate: + controller_table = [] + for r in controller_results: + status_icon = "✅" if r['status'] == 'Running' and not r['issues'] else "❌" + controller_table.append([ + status_icon, + r['controller'], + r['status'], + len(r['issues']) + ]) + print(tabulate( + controller_table, + headers=['', 'Controller', 'Status', 'Issues'], + tablefmt='simple' + )) + else: + for r in controller_results: + status_icon = "✅" if r['status'] == 'Running' and not r['issues'] else "❌" + print(f"{status_icon} {r['controller']}: {r['status']} ({len(r['issues'])} issues)") + + # Resource health + if resource_results: + print("\n📦 Flux Resources:\n") + + if tabulate: + resource_table = [] + for r in resource_results: + status_icon 
= "✅" if r['ready'] == 'True' and not r['issues'] else "❌" + resource_table.append([ + status_icon, + r['type'], + r['name'], + r['namespace'], + r['ready'], + len(r['issues']) + ]) + print(tabulate( + resource_table, + headers=['', 'Type', 'Name', 'Namespace', 'Ready', 'Issues'], + tablefmt='simple' + )) + else: + for r in resource_results: + status_icon = "✅" if r['ready'] == 'True' and not r['issues'] else "❌" + print(f"{status_icon} {r['type']}/{r['name']}: {r['ready']} ({len(r['issues'])} issues)") + + # Detailed issues + all_results = controller_results + resource_results + issues_found = [r for r in all_results if r.get('issues')] + + if issues_found: + print("\n🔍 Detailed Issues:\n") + for r in issues_found: + print(f"{r.get('controller') or r.get('type')}/{r.get('name', 'N/A')}:") + for issue in r['issues']: + print(f" • {issue}") + if r.get('recommendations'): + print(" Recommendations:") + for rec in r['recommendations']: + print(f" → {rec}") + print() + else: + print("\n✅ No issues found!") + + +def main(): + parser = argparse.ArgumentParser( + description='Check Flux CD health and diagnose reconciliation issues (Flux v2.7+ compatible)', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check Flux controllers and all resources + python3 check_flux_health.py + + # Check specific namespace + python3 check_flux_health.py --namespace my-app + + # Check only GitRepositories + python3 check_flux_health.py --type gitrepositories + + # Check OCI repositories (Flux v2.6+) + python3 check_flux_health.py --type ocirepositories + + # Output as JSON + python3 check_flux_health.py --json + +Flux v2.7+ Features: + - OCI artifact support (GA in v2.6) + - Image automation (GA in v2.7) + - Source-watcher component + - OpenTelemetry tracing + """ + ) + + parser.add_argument('--namespace', default='flux-system', help='Flux namespace (default: flux-system)') + parser.add_argument('--type', help='Check specific resource type only') + parser.add_argument('--kubeconfig', help='Path to kubeconfig file') + parser.add_argument('--json', action='store_true', help='Output as JSON') + + args = parser.parse_args() + + try: + checker = FluxHealthChecker(namespace=args.namespace, kubeconfig=args.kubeconfig) + + # Check controllers + controller_results = checker.check_flux_controllers() + + # Check resources + resource_results = [] + resource_types = [args.type] if args.type else [ + 'gitrepositories', + 'ocirepositories', + 'helmrepositories', + 'kustomizations', + 'helmreleases', + 'imageupdateautomations', + ] + + for resource_type in resource_types: + resources = checker.get_flux_resources(resource_type) + for resource in resources: + result = checker.check_resource_health(resource, resource_type) + resource_results.append(result) + + if args.json: + print(json.dumps({ + 'controllers': controller_results, + 'resources': resource_results + }, indent=2)) + else: + checker.print_summary(resource_results, controller_results) + + except KeyboardInterrupt: + print("\n\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"❌ Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/oci_artifact_checker.py b/scripts/oci_artifact_checker.py new file mode 100644 index 0000000..51f5ecf --- /dev/null +++ b/scripts/oci_artifact_checker.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +""" +Validate Flux OCI artifact references and verify signatures. 
+Supports Flux v2.6+ OCI artifacts with cosign/notation verification. +""" + +import argparse +import sys +import subprocess +import json + +try: + from kubernetes import client, config +except ImportError: + print("⚠️ 'kubernetes' not found. Install with: pip install kubernetes") + sys.exit(1) + + +def check_oci_repository(name: str, namespace: str = 'flux-system'): + """Check OCIRepository resource status.""" + try: + config.load_kube_config() + api = client.CustomObjectsApi() + + oci_repo = api.get_namespaced_custom_object( + group='source.toolkit.fluxcd.io', + version='v1beta2', + namespace=namespace, + plural='ocirepositories', + name=name + ) + + status = oci_repo.get('status', {}) + conditions = status.get('conditions', []) + ready = next((c for c in conditions if c['type'] == 'Ready'), None) + + print(f"📦 OCIRepository: {name}") + print(f" Ready: {ready.get('status') if ready else 'Unknown'}") + print(f" Message: {ready.get('message', 'N/A') if ready else 'N/A'}") + + # Check artifact + artifact = status.get('artifact') + if artifact: + print(f" Artifact: {artifact.get('revision', 'N/A')}") + print(f" Digest: {artifact.get('digest', 'N/A')}") + else: + print(" ⚠️ No artifact available") + + # Check verification + spec = oci_repo.get('spec', {}) + if spec.get('verify'): + print(" ✓ Signature verification enabled") + provider = spec['verify'].get('provider', 'cosign') + print(f" Provider: {provider}") + else: + print(" ⚠️ No signature verification") + + return ready.get('status') == 'True' if ready else False + + except Exception as e: + print(f"❌ Error checking OCIRepository: {e}") + return False + + +def verify_oci_artifact(image: str, provider: str = 'cosign'): + """Verify OCI artifact signature.""" + print(f"\n🔐 Verifying {image} with {provider}...\n") + + if provider == 'cosign': + try: + result = subprocess.run( + ['cosign', 'verify', image], + capture_output=True, + text=True + ) + if result.returncode == 0: + print("✅ Signature verification successful") + return True + else: + print(f"❌ Verification failed: {result.stderr}") + return False + except FileNotFoundError: + print("⚠️ cosign not found. Install: https://github.com/sigstore/cosign") + return False + + elif provider == 'notation': + try: + result = subprocess.run( + ['notation', 'verify', image], + capture_output=True, + text=True + ) + if result.returncode == 0: + print("✅ Signature verification successful") + return True + else: + print(f"❌ Verification failed: {result.stderr}") + return False + except FileNotFoundError: + print("⚠️ notation not found. 
Install: https://notaryproject.dev") + return False + + +def main(): + parser = argparse.ArgumentParser( + description='Validate Flux OCI artifacts and verify signatures', + epilog=""" +Examples: + # Check OCIRepository status + python3 oci_artifact_checker.py --name my-app-oci --namespace flux-system + + # Verify OCI artifact signature with cosign + python3 oci_artifact_checker.py --verify ghcr.io/org/app:v1.0.0 + + # Verify with notation + python3 oci_artifact_checker.py --verify myregistry.io/app:latest --provider notation + +Requirements: + - kubectl configured for cluster access + - cosign (for signature verification) + - notation (for notation verification) + +Flux v2.6+ OCI Features: + - OCIRepository for Helm charts and Kustomize overlays + - Signature verification with cosign or notation + - Digest pinning for immutability + """ + ) + + parser.add_argument('--name', help='OCIRepository name') + parser.add_argument('--namespace', default='flux-system', help='Namespace') + parser.add_argument('--verify', help='OCI image to verify') + parser.add_argument('--provider', choices=['cosign', 'notation'], default='cosign', + help='Verification provider') + + args = parser.parse_args() + + if args.name: + check_oci_repository(args.name, args.namespace) + + if args.verify: + verify_oci_artifact(args.verify, args.provider) + + if not args.name and not args.verify: + print("❌ Specify --name or --verify") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/promotion_validator.py b/scripts/promotion_validator.py new file mode 100644 index 0000000..f916d77 --- /dev/null +++ b/scripts/promotion_validator.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +""" +Validate environment promotion workflows (dev → staging → production). +Checks that changes are promoted through environments in the correct order. +""" + +import argparse +import sys +import subprocess +from pathlib import Path + + +def get_git_diff(ref1: str, ref2: str, path: str = ".") -> str: + """Get git diff between two refs.""" + try: + result = subprocess.run( + ['git', 'diff', f'{ref1}...{ref2}', '--', path], + capture_output=True, + text=True, + check=True + ) + return result.stdout + except subprocess.CalledProcessError as e: + print(f"❌ Git diff failed: {e}") + sys.exit(1) + + +def validate_promotion(source_env: str, target_env: str, repo_path: str): + """Validate that changes exist in source before promoting to target.""" + print(f"🔍 Validating promotion: {source_env} → {target_env}\n") + + # Check that source and target directories exist + source_path = Path(repo_path) / f"environments/{source_env}" + target_path = Path(repo_path) / f"environments/{target_env}" + + if not source_path.exists(): + print(f"❌ Source environment not found: {source_path}") + sys.exit(1) + + if not target_path.exists(): + print(f"❌ Target environment not found: {target_path}") + sys.exit(1) + + # Check git history - target should not have changes that source doesn't have + diff = get_git_diff('HEAD~10', 'HEAD', str(target_path)) + + if diff and source_env == 'dev': + # If there are recent changes to target (prod/staging) check they came from source + print("⚠️ Recent changes detected in target environment") + print(" Verify changes were promoted from dev/staging first") + + print("✅ Promotion path is valid") + print(f"\nNext steps:") + print(f"1. Review changes in {source_env}") + print(f"2. Test in {source_env} environment") + print(f"3. Copy changes to {target_env}") + print(f"4. 
Create PR for {target_env} promotion") + + +def main(): + parser = argparse.ArgumentParser( + description='Validate environment promotion workflows', + epilog=""" +Examples: + # Validate dev → staging promotion + python3 promotion_validator.py --source dev --target staging + + # Validate staging → production promotion + python3 promotion_validator.py --source staging --target production + +Checks: + - Environment directories exist + - Changes flow through proper promotion path + - No direct changes to production + """ + ) + + parser.add_argument('--source', required=True, help='Source environment (dev/staging)') + parser.add_argument('--target', required=True, help='Target environment (staging/production)') + parser.add_argument('--repo-path', default='.', help='Repository path') + + args = parser.parse_args() + + validate_promotion(args.source, args.target, args.repo_path) + + +if __name__ == '__main__': + main() diff --git a/scripts/secret_audit.py b/scripts/secret_audit.py new file mode 100644 index 0000000..27b702b --- /dev/null +++ b/scripts/secret_audit.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Audit secrets management in GitOps repositories. +Checks for plain secrets, SOPS, Sealed Secrets, and External Secrets Operator. +""" + +import argparse +import sys +from pathlib import Path +from typing import List, Dict + +try: + import yaml +except ImportError: + print("⚠️ 'pyyaml' not found. Install with: pip install pyyaml") + sys.exit(1) + + +class SecretAuditor: + def __init__(self, repo_path: str): + self.repo_path = Path(repo_path) + self.findings = [] + + def audit(self) -> Dict: + """Run all secret audits.""" + print(f"🔐 Auditing secrets in: {self.repo_path}\n") + + self._check_plain_secrets() + self._check_sops_config() + self._check_sealed_secrets() + self._check_external_secrets() + + return self._generate_report() + + def _check_plain_secrets(self): + """Check for plain Kubernetes secrets.""" + secret_files = list(self.repo_path.rglob('*.yaml')) + list(self.repo_path.rglob('*.yml')) + plain_secrets = [] + + for sfile in secret_files: + if '.git' in sfile.parts: + continue + + try: + with open(sfile) as f: + for doc in yaml.safe_load_all(f): + if doc and doc.get('kind') == 'Secret': + # Skip service account tokens + if doc.get('type') == 'kubernetes.io/service-account-token': + continue + # Check if it's encrypted + if 'sops' not in str(doc) and doc.get('kind') != 'SealedSecret': + plain_secrets.append(sfile.relative_to(self.repo_path)) + except: + pass + + if plain_secrets: + self.findings.append({ + 'severity': 'HIGH', + 'type': 'Plain Secrets', + 'count': len(plain_secrets), + 'message': f"Found {len(plain_secrets)} plain Kubernetes Secret manifests", + 'recommendation': 'Encrypt with SOPS, Sealed Secrets, or use External Secrets Operator', + 'files': [str(f) for f in plain_secrets[:5]] + }) + else: + print("✅ No plain secrets found in Git") + + def _check_sops_config(self): + """Check SOPS configuration.""" + sops_config = self.repo_path / '.sops.yaml' + + if sops_config.exists(): + print("✅ SOPS config found (.sops.yaml)") + with open(sops_config) as f: + config = yaml.safe_load(f) + + # Check for age keys + if 'age' in str(config): + print(" ✓ Using age encryption (recommended)") + elif 'pgp' in str(config): + print(" ⚠️ Using PGP (consider migrating to age)") + self.findings.append({ + 'severity': 'LOW', + 'type': 'SOPS Configuration', + 'message': 'Using PGP encryption', + 'recommendation': 'Migrate to age for better security and simplicity' + }) + else: + 
encrypted_files = list(self.repo_path.rglob('*.enc.yaml')) + if encrypted_files: + print("⚠️ SOPS encrypted files found but no .sops.yaml config") + self.findings.append({ + 'severity': 'MEDIUM', + 'type': 'SOPS Configuration', + 'message': 'Encrypted files without .sops.yaml', + 'recommendation': 'Add .sops.yaml for consistent encryption settings' + }) + + def _check_sealed_secrets(self): + """Check Sealed Secrets usage.""" + sealed_secrets = list(self.repo_path.rglob('*sealedsecret*.yaml')) + + if sealed_secrets: + print(f"✅ Found {len(sealed_secrets)} Sealed Secrets") + + def _check_external_secrets(self): + """Check External Secrets Operator usage.""" + eso_files = list(self.repo_path.rglob('*externalsecret*.yaml')) + \ + list(self.repo_path.rglob('*secretstore*.yaml')) + + if eso_files: + print(f"✅ Found {len(eso_files)} External Secrets manifests") + + def _generate_report(self) -> Dict: + """Generate audit report.""" + return { + 'findings': self.findings, + 'total_issues': len(self.findings), + 'high_severity': len([f for f in self.findings if f['severity'] == 'HIGH']), + 'medium_severity': len([f for f in self.findings if f['severity'] == 'MEDIUM']), + 'low_severity': len([f for f in self.findings if f['severity'] == 'LOW']) + } + + +def main(): + parser = argparse.ArgumentParser( + description='Audit secrets management in GitOps repositories', + epilog=""" +Examples: + # Audit current directory + python3 secret_audit.py . + + # Audit specific repo + python3 secret_audit.py /path/to/gitops-repo + +Checks: + - Plain Kubernetes Secrets in Git (HIGH risk) + - SOPS configuration and encryption method + - Sealed Secrets usage + - External Secrets Operator usage + """ + ) + + parser.add_argument('repo_path', help='Path to GitOps repository') + + args = parser.parse_args() + + auditor = SecretAuditor(args.repo_path) + report = auditor.audit() + + # Print summary + print("\n" + "="*60) + print("📊 Audit Summary") + print("="*60) + + if report['findings']: + print(f"\n🔴 HIGH: {report['high_severity']}") + print(f"🟡 MEDIUM: {report['medium_severity']}") + print(f"🟢 LOW: {report['low_severity']}") + + print("\n📋 Findings:\n") + for f in report['findings']: + icon = {'HIGH': '🔴', 'MEDIUM': '🟡', 'LOW': '🟢'}[f['severity']] + print(f"{icon} [{f['severity']}] {f['type']}") + print(f" {f['message']}") + print(f" → {f['recommendation']}") + if 'files' in f and f['files']: + print(f" Files: {', '.join(f['files'][:3])}") + print() + else: + print("\n✅ No security issues found!") + + sys.exit(1 if report['high_severity'] > 0 else 0) + + +if __name__ == '__main__': + main() diff --git a/scripts/sync_drift_detector.py b/scripts/sync_drift_detector.py new file mode 100644 index 0000000..0760a21 --- /dev/null +++ b/scripts/sync_drift_detector.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Detect configuration drift between Git and Kubernetes cluster. +Supports both ArgoCD and Flux CD deployments. +""" + +import argparse +import sys +import subprocess +import json +from typing import Dict, List, Optional + +try: + from kubernetes import client, config +except ImportError: + print("⚠️ 'kubernetes' library not found. Install with: pip install kubernetes") + sys.exit(1) + +try: + import yaml +except ImportError: + print("⚠️ 'pyyaml' library not found. 
Install with: pip install pyyaml") + sys.exit(1) + + +def run_command(cmd: List[str]) -> tuple: + """Run shell command and return output.""" + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout, None + except subprocess.CalledProcessError as e: + return None, e.stderr + + +def check_argocd_drift(app_name: Optional[str] = None): + """Check drift using ArgoCD CLI.""" + print("🔍 Checking ArgoCD drift...\n") + + cmd = ['argocd', 'app', 'diff'] + if app_name: + cmd.append(app_name) + else: + # Get all apps + stdout, err = run_command(['argocd', 'app', 'list', '-o', 'json']) + if err: + print(f"❌ Failed to list apps: {err}") + return + + apps = json.loads(stdout) + for app in apps: + app_name = app['metadata']['name'] + check_single_app_drift(app_name) + return + + check_single_app_drift(app_name) + + +def check_single_app_drift(app_name: str): + """Check drift for single ArgoCD application.""" + stdout, err = run_command(['argocd', 'app', 'diff', app_name]) + + if err and 'no differences' not in err.lower(): + print(f"❌ {app_name}: Error checking drift") + print(f" {err}") + return + + if not stdout or 'no differences' in (stdout + (err or '')).lower(): + print(f"✅ {app_name}: No drift detected") + else: + print(f"⚠️ {app_name}: Drift detected") + print(f" Run: argocd app sync {app_name}") + + +def check_flux_drift(namespace: str = 'flux-system'): + """Check drift using Flux CLI.""" + print("🔍 Checking Flux drift...\n") + + # Check kustomizations + stdout, err = run_command(['flux', 'get', 'kustomizations', '-n', namespace, '--status-selector', 'ready=false']) + + if stdout: + print("⚠️ Out-of-sync Kustomizations:") + print(stdout) + else: + print("✅ All Kustomizations synced") + + # Check helmreleases + stdout, err = run_command(['flux', 'get', 'helmreleases', '-n', namespace, '--status-selector', 'ready=false']) + + if stdout: + print("\n⚠️ Out-of-sync HelmReleases:") + print(stdout) + else: + print("✅ All HelmReleases synced") + + +def main(): + parser = argparse.ArgumentParser( + description='Detect configuration drift between Git and cluster', + epilog=""" +Examples: + # Check ArgoCD drift + python3 sync_drift_detector.py --argocd + + # Check specific ArgoCD app + python3 sync_drift_detector.py --argocd --app my-app + + # Check Flux drift + python3 sync_drift_detector.py --flux + +Requirements: + - argocd CLI (for ArgoCD mode) + - flux CLI (for Flux mode) + - kubectl configured + """ + ) + + parser.add_argument('--argocd', action='store_true', help='Check ArgoCD drift') + parser.add_argument('--flux', action='store_true', help='Check Flux drift') + parser.add_argument('--app', help='Specific ArgoCD application name') + parser.add_argument('--namespace', default='flux-system', help='Flux namespace') + + args = parser.parse_args() + + if not args.argocd and not args.flux: + print("❌ Specify --argocd or --flux") + sys.exit(1) + + try: + if args.argocd: + check_argocd_drift(args.app) + if args.flux: + check_flux_drift(args.namespace) + + except KeyboardInterrupt: + print("\n\nInterrupted") + sys.exit(1) + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/validate_gitops_repo.py b/scripts/validate_gitops_repo.py new file mode 100644 index 0000000..2921331 --- /dev/null +++ b/scripts/validate_gitops_repo.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Validate GitOps repository structure, manifests, and best practices. 
+Supports both monorepo and polyrepo patterns with Kustomize and Helm. +""" + +import argparse +import sys +import os +import glob +from typing import Dict, List, Any, Tuple +from pathlib import Path + +try: + import yaml +except ImportError: + print("⚠️ Warning: 'pyyaml' library not found. Install with: pip install pyyaml") + sys.exit(1) + + +class GitOpsRepoValidator: + def __init__(self, repo_path: str): + self.repo_path = Path(repo_path).resolve() + if not self.repo_path.exists(): + raise ValueError(f"Path does not exist: {repo_path}") + + self.issues = [] + self.warnings = [] + self.recommendations = [] + + def validate(self) -> Dict[str, List[str]]: + """Run all validations.""" + print(f"🔍 Validating GitOps repository: {self.repo_path}\n") + + # Structure validations + self._check_repository_structure() + self._check_kustomization_files() + self._check_yaml_syntax() + self._check_best_practices() + self._check_secrets_management() + + return { + 'issues': self.issues, + 'warnings': self.warnings, + 'recommendations': self.recommendations + } + + def _check_repository_structure(self): + """Check repository structure and organization.""" + print("📁 Checking repository structure...") + + # Check for common patterns + has_apps = (self.repo_path / 'apps').exists() + has_clusters = (self.repo_path / 'clusters').exists() + has_infrastructure = (self.repo_path / 'infrastructure').exists() + has_base = (self.repo_path / 'base').exists() + has_overlays = (self.repo_path / 'overlays').exists() + + if not any([has_apps, has_clusters, has_infrastructure, has_base]): + self.warnings.append("No standard directory structure detected (apps/, clusters/, infrastructure/, base/)") + self.recommendations.append("Consider organizing with: apps/ (applications), infrastructure/ (cluster config), clusters/ (per-cluster)") + + # Check for Flux bootstrap (if Flux) + flux_system = self.repo_path / 'clusters' / 'flux-system' + if flux_system.exists(): + print(" ✓ Flux bootstrap detected") + if not (flux_system / 'gotk-components.yaml').exists(): + self.warnings.append("Flux bootstrap directory exists but gotk-components.yaml not found") + + # Check for ArgoCD bootstrap (if ArgoCD) + argocd_patterns = list(self.repo_path.rglob('*argocd-*.yaml')) + if argocd_patterns: + print(" ✓ ArgoCD manifests detected") + + def _check_kustomization_files(self): + """Check Kustomization files for validity.""" + print("\n🔧 Checking Kustomization files...") + + kustomization_files = list(self.repo_path.rglob('kustomization.yaml')) + \ + list(self.repo_path.rglob('kustomization.yml')) + + if not kustomization_files: + self.warnings.append("No kustomization.yaml files found") + return + + print(f" Found {len(kustomization_files)} kustomization files") + + for kfile in kustomization_files: + try: + with open(kfile, 'r') as f: + content = yaml.safe_load(f) + + if not content: + self.issues.append(f"Empty kustomization file: {kfile.relative_to(self.repo_path)}") + continue + + # Check for required fields + if 'resources' not in content and 'bases' not in content and 'components' not in content: + self.warnings.append(f"Kustomization has no resources/bases: {kfile.relative_to(self.repo_path)}") + + # Check for deprecated 'bases' (Kustomize 5.7+) + if 'bases' in content: + self.warnings.append(f"Using deprecated 'bases' field: {kfile.relative_to(self.repo_path)}") + self.recommendations.append("Migrate 'bases:' to 'resources:' (Kustomize 5.0+)") + + except yaml.YAMLError as e: + self.issues.append(f"Invalid YAML in 
{kfile.relative_to(self.repo_path)}: {e}") + except Exception as e: + self.issues.append(f"Error reading {kfile.relative_to(self.repo_path)}: {e}") + + def _check_yaml_syntax(self): + """Check YAML files for syntax errors.""" + print("\n📝 Checking YAML syntax...") + + yaml_files = list(self.repo_path.rglob('*.yaml')) + list(self.repo_path.rglob('*.yml')) + + # Exclude certain directories + exclude_dirs = {'.git', 'node_modules', 'vendor', '.github'} + yaml_files = [f for f in yaml_files if not any(ex in f.parts for ex in exclude_dirs)] + + syntax_errors = 0 + for yfile in yaml_files: + try: + with open(yfile, 'r') as f: + yaml.safe_load_all(f) + except yaml.YAMLError as e: + self.issues.append(f"YAML syntax error in {yfile.relative_to(self.repo_path)}: {e}") + syntax_errors += 1 + + if syntax_errors == 0: + print(f" ✓ All {len(yaml_files)} YAML files are valid") + else: + print(f" ✗ {syntax_errors} YAML files have syntax errors") + + def _check_best_practices(self): + """Check GitOps best practices.""" + print("\n✨ Checking best practices...") + + # Check for namespace definitions + namespace_files = list(self.repo_path.rglob('*namespace*.yaml')) + if not namespace_files: + self.recommendations.append("No namespace definitions found. Consider explicitly defining namespaces.") + + # Check for image tags (not 'latest') + all_yamls = list(self.repo_path.rglob('*.yaml')) + list(self.repo_path.rglob('*.yml')) + latest_tag_count = 0 + + for yfile in all_yamls: + try: + with open(yfile, 'r') as f: + content = f.read() + if ':latest' in content or 'image: latest' in content: + latest_tag_count += 1 + except: + pass + + if latest_tag_count > 0: + self.warnings.append(f"Found {latest_tag_count} files using ':latest' image tag") + self.recommendations.append("Pin image tags to specific versions or digests for reproducibility") + + # Check for resource limits + deployment_files = [f for f in all_yamls if 'deployment' in str(f).lower() or 'statefulset' in str(f).lower()] + missing_limits = 0 + + for dfile in deployment_files: + try: + with open(dfile, 'r') as f: + content = yaml.safe_load_all(f) + for doc in content: + if not doc or doc.get('kind') not in ['Deployment', 'StatefulSet']: + continue + + containers = doc.get('spec', {}).get('template', {}).get('spec', {}).get('containers', []) + for container in containers: + if 'resources' not in container or 'limits' not in container.get('resources', {}): + missing_limits += 1 + break + except: + pass + + if missing_limits > 0: + self.recommendations.append(f"{missing_limits} Deployments/StatefulSets missing resource limits") + + def _check_secrets_management(self): + """Check for secrets management practices.""" + print("\n🔐 Checking secrets management...") + + # Check for plain Kubernetes secrets + secret_files = list(self.repo_path.rglob('*secret*.yaml')) + plain_secrets = [] + + for sfile in secret_files: + try: + with open(sfile, 'r') as f: + for doc in yaml.safe_load_all(f): + if doc and doc.get('kind') == 'Secret' and doc.get('type') != 'kubernetes.io/service-account-token': + # Check if it's a SealedSecret or ExternalSecret + if doc.get('kind') not in ['SealedSecret'] and 'external-secrets.io' not in doc.get('apiVersion', ''): + plain_secrets.append(sfile.relative_to(self.repo_path)) + except: + pass + + if plain_secrets: + self.issues.append(f"Found {len(plain_secrets)} plain Kubernetes Secret manifests in Git") + self.recommendations.append("Use Sealed Secrets, External Secrets Operator, or SOPS for secrets management") + for s in 
plain_secrets[:3]: # Show first 3 + self.issues.append(f" - {s}") + + # Check for SOPS configuration + sops_config = self.repo_path / '.sops.yaml' + if sops_config.exists(): + print(" ✓ SOPS configuration found (.sops.yaml)") + + # Check for Sealed Secrets + sealed_secrets = list(self.repo_path.rglob('*sealedsecret*.yaml')) + if sealed_secrets: + print(f" ✓ Found {len(sealed_secrets)} SealedSecret manifests") + + # Check for External Secrets + external_secrets = [f for f in self.repo_path.rglob('*.yaml') + if 'externalsecret' in str(f).lower() or 'secretstore' in str(f).lower()] + if external_secrets: + print(f" ✓ Found {len(external_secrets)} External Secrets manifests") + + if not sops_config.exists() and not sealed_secrets and not external_secrets and plain_secrets: + self.recommendations.append("No secrets management solution detected. Consider implementing Sealed Secrets, ESO, or SOPS+age") + + +def main(): + parser = argparse.ArgumentParser( + description='Validate GitOps repository structure and manifests', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Validate current directory + python3 validate_gitops_repo.py . + + # Validate specific repository + python3 validate_gitops_repo.py /path/to/gitops-repo + + # Show only issues (no warnings) + python3 validate_gitops_repo.py . --errors-only + +Checks: + - Repository structure (monorepo/polyrepo patterns) + - Kustomization file validity + - YAML syntax errors + - Best practices (image tags, resource limits, namespaces) + - Secrets management (detect plain secrets, check for SOPS/Sealed Secrets/ESO) + """ + ) + + parser.add_argument('repo_path', help='Path to GitOps repository') + parser.add_argument('--errors-only', action='store_true', help='Show only errors, not warnings') + + args = parser.parse_args() + + try: + validator = GitOpsRepoValidator(args.repo_path) + results = validator.validate() + + # Print summary + print("\n" + "="*60) + print("📊 Validation Summary") + print("="*60) + + if results['issues']: + print(f"\n❌ Issues ({len(results['issues'])}):") + for issue in results['issues']: + print(f" • {issue}") + + if results['warnings'] and not args.errors_only: + print(f"\n⚠️ Warnings ({len(results['warnings'])}):") + for warning in results['warnings']: + print(f" • {warning}") + + if results['recommendations'] and not args.errors_only: + print(f"\n💡 Recommendations ({len(results['recommendations'])}):") + for rec in results['recommendations']: + print(f" → {rec}") + + if not results['issues'] and not results['warnings']: + print("\n✅ No issues found! Repository structure looks good.") + + # Exit code + sys.exit(1 if results['issues'] else 0) + + except KeyboardInterrupt: + print("\n\nInterrupted by user") + sys.exit(1) + except Exception as e: + print(f"❌ Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main()
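
A minimal usage sketch for wiring the validation scripts together as a pre-merge check, assuming they are run from the root of the GitOps repository being validated; the flags and exit codes below are the ones defined in each script's argparse interface:

```bash
# Structure, Kustomize, YAML-syntax, and best-practice checks (exit code 1 on issues)
python3 scripts/validate_gitops_repo.py .

# Secrets audit (exit code 1 when plain Kubernetes Secrets are found in Git)
python3 scripts/secret_audit.py .

# Once deployed, compare the live cluster against Git
python3 scripts/sync_drift_detector.py --argocd                           # ArgoCD
python3 scripts/sync_drift_detector.py --flux --namespace flux-system     # Flux
```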