Initial commit

2025-11-30 08:19:54 +08:00
commit 67c918fc64
11 changed files with 616 additions and 0 deletions
--- a/skills/monitoring-stack-deployer/assets/README.md
+++ b/skills/monitoring-stack-deployer/assets/README.md
@@ -0,0 +1,9 @@
+# Assets
+
+Bundled resources for the monitoring-stack-deployer skill.
+
+- [ ] prometheus_config_template.yml: Template for Prometheus configuration files.
+- [ ] grafana_dashboard_template.json: Template for Grafana dashboard configurations.
+- [ ] datadog_agent_config_template.yml: Template for Datadog agent configuration files.
+- [ ] example_k8s_manifests/: Example Kubernetes manifests for deploying the monitoring stack.
+- [ ] example_terraform_configurations/: Example Terraform configurations for deploying the monitoring stack.
--- a/skills/monitoring-stack-deployer/assets/datadog_agent_config_template.yml
+++ b/skills/monitoring-stack-deployer/assets/datadog_agent_config_template.yml
@@ -0,0 +1,90 @@
+# Datadog Agent Configuration Template
+
+# This file provides a template for configuring the Datadog agent.
+# Replace placeholders with your specific values.
+
+# Agent Configuration
+hostname: REPLACE_ME_HOSTNAME  # Hostname reported to Datadog; delete this line to let the Agent detect it.
+# tags:  # Optional: list of tags applied to everything this Agent reports.
+#   - env:production
+#   - role:webserver
+
+# API Key
+api_key: YOUR_DATADOG_API_KEY  # Required. Never commit a real key; the DD_API_KEY environment variable can supply it instead.
+
+# Site
+# site: datadoghq.com  # The Datadog site. Defaults to datadoghq.com.
+
+# Listen Address
+# listen_address: 0.0.0.0  # NOTE(review): not a documented datadog.yaml key (bind_host is the closest) - confirm before enabling.
+
+# Log Level
+log_level: info  # Valid values: trace, debug, info, warn, error, critical, off (quote 'off').
+
+# Log to file
+# log_file: /var/log/datadog/agent.log  # Uncomment to log to a file. Ensure proper permissions are set.
+
+# Enable/Disable Agent
+# enabled: true  # NOTE(review): no such switch is documented for datadog.yaml - confirm or remove.
+
+# --- Integrations ---
+# NOTE(review): Agent v6/v7 loads check settings from conf.d/<check>.d/conf.yaml; confirm these top-level stanzas are actually read.
+# Example: System Check
+system_core_check:
+  enabled: true
+  collect_count: true # Collect CPU core counts
+  # use_mount: false # NOTE(review): use_mount looks like a disk-check option copied here - confirm it applies.
+
+# Example: Network Check
+network:
+  enabled: true
+  collect_connection_state: true # Collect TCP connection states
+  excluded_interfaces:
+    - lo # Exclude loopback interface
+
+# Example: Disk Check
+disk:
+  enabled: true
+  all_partitions: false # Only monitor certain partitions
+  partitions:
+    - /
+    - /var
+    - /tmp
+  # use_mount: false # Disable mount point metrics (defaults to true)
+  # excluded_filesystems:  # Optional: List of filesystem types to exclude.
+  #   - tmpfs
+
+# --- Advanced Configuration ---
+
+# Proxy settings (only needed when outbound traffic must go through a proxy)
+# proxy:
+#   http: http://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
+#   https: https://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
+#   no_proxy: localhost,127.0.0.1,YOUR_INTERNAL_DOMAIN  # NOTE(review): Datadog documents no_proxy as a YAML list - confirm format.
+
+# DogStatsD Configuration (for custom metrics)
+dogstatsd_port: 8125 # UDP port for DogStatsD
+# dogstatsd_non_local_traffic: true # Accept DogStatsD packets from other hosts/containers (security implications!)
+
+# --- Autodiscovery ---
+# For containerized environments (Docker, Kubernetes)
+# listeners:  # Key name per the Agent's datadog.yaml reference (not autodiscovery_listeners).
+#   - name: docker
+#   - name: kubelet
+
+# Directory the Agent scans for check configuration files (custom checks included)
+# confd_path: /etc/datadog-agent/conf.d
+
+# --- Process Agent ---
+# process_config:
+#   enabled: true # Enable process collection
+#   process_collection:
+#     enabled: true
+#   container_collection:
+#     enabled: true
+#   process_discovery:
+#     enabled: false # Disable process discovery by default for security
+
+# --- Security Agent ---
+# security_agent:
+#   enabled: false # Disable security agent by default
--- a/skills/monitoring-stack-deployer/assets/grafana_dashboard_template.json
+++ b/skills/monitoring-stack-deployer/assets/grafana_dashboard_template.json
@@ -0,0 +1,258 @@
+{
+  "_comment": "Grafana Dashboard Template for Monitoring Stack Deployer",
+  "dashboard": {
+    "annotations": {
+      "list": []
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "id": null,
+    "links": [],
+    "panels": [
+      {
+        "_comment": "Example: CPU Usage Panel",
+        "aliasColors": {},
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "${DS_PROMETHEUS}",
+        "decimals": 2,
+        "fill": 1,
+        "fillGradient": 0,
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 0
+        },
+        "hiddenSeries": false,
+        "id": 1,
+        "interval": null,
+        "legend": {
+          "avg": true,
+          "current": true,
+          "max": true,
+          "min": true,
+          "show": true,
+          "total": false,
+          "values": true
+        },
+        "lines": true,
+        "linewidth": 1,
+        "nullPointMode": "null",
+        "options": {
+          "dataLinks": []
+        },
+        "percentage": false,
+        "pluginVersion": "7.5.7",
+        "pointradius": 2,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [],
+        "spaceLength": 10,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "rate(process_cpu_seconds_total[5m])",
+            "instant": false,
+            "legendFormat": "{{instance}}",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [],
+        "timeFrom": null,
+        "timeRegions": [],
+        "timeShift": null,
+        "title": "CPU Usage",
+        "tooltip": {
+          "shared": true,
+          "sort": 0,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": []
+        },
+        "yaxes": [
+          {
+            "decimals": 2,
+            "format": "percentunit",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": "0",
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          }
+        ]
+      },
+      {
+        "_comment": "Example: Memory Usage Panel",
+        "aliasColors": {},
+        "bars": false,
+        "dashLength": 10,
+        "dashes": false,
+        "datasource": "${DS_PROMETHEUS}",
+        "decimals": 2,
+        "fill": 1,
+        "fillGradient": 0,
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 0
+        },
+        "hiddenSeries": false,
+        "id": 2,
+        "interval": null,
+        "legend": {
+          "avg": true,
+          "current": true,
+          "max": true,
+          "min": true,
+          "show": true,
+          "total": false,
+          "values": true
+        },
+        "lines": true,
+        "linewidth": 1,
+        "nullPointMode": "null",
+        "options": {
+          "dataLinks": []
+        },
+        "percentage": false,
+        "pluginVersion": "7.5.7",
+        "pointradius": 2,
+        "points": false,
+        "renderer": "flot",
+        "seriesOverrides": [],
+        "spaceLength": 10,
+        "stack": false,
+        "steppedLine": false,
+        "targets": [
+          {
+            "expr": "sum(container_memory_usage_bytes) by (instance)",
+            "instant": false,
+            "legendFormat": "{{instance}}",
+            "refId": "A"
+          }
+        ],
+        "thresholds": [],
+        "timeFrom": null,
+        "timeRegions": [],
+        "timeShift": null,
+        "title": "Memory Usage",
+        "tooltip": {
+          "shared": true,
+          "sort": 0,
+          "value_type": "individual"
+        },
+        "type": "graph",
+        "xaxis": {
+          "buckets": null,
+          "mode": "time",
+          "name": null,
+          "show": true,
+          "values": []
+        },
+        "yaxes": [
+          {
+            "format": "bytes",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": "0",
+            "show": true
+          },
+          {
+            "format": "short",
+            "label": null,
+            "logBase": 1,
+            "max": null,
+            "min": null,
+            "show": true
+          }
+        ]
+      }
+    ],
+    "refresh": "5s",
+    "schemaVersion": 26,
+    "style": "dark",
+    "tags": [
+      "monitoring"
+    ],
+    "templating": {
+      "list": [
+        {
+          "_comment": "Example Prometheus Datasource Variable",
+          "current": {
+            "selected": false,
+            "text": "Prometheus",
+            "value": "Prometheus"
+          },
+          "datasource": null,
+          "definition": "Prometheus",
+          "hide": 0,
+          "includeAll": false,
+          "label": "Prometheus",
+          "multi": false,
+          "name": "DS_PROMETHEUS",
+          "options": [],
+          "query": "prometheus",
+          "refresh": 1,
+          "regex": "",
+          "sort": 0,
+          "tagValuesQuery": "",
+          "tagsQuery": "",
+          "type": "datasource"
+        }
+      ]
+    },
+    "time": {
+      "from": "now-1h",
+      "to": "now"
+    },
+    "timepicker": {
+      "refresh_intervals": [
+        "5s",
+        "10s",
+        "30s",
+        "1m",
+        "5m",
+        "15m",
+        "30m",
+        "1h",
+        "2h",
+        "1d"
+      ],
+      "time_options": [
+        "5m",
+        "15m",
+        "1h",
+        "6h",
+        "12h",
+        "24h",
+        "2d",
+        "7d",
+        "30d"
+      ]
+    },
+    "timezone": "",
+    "title": "Monitoring Dashboard",
+    "uid": "example-dashboard",
+    "version": 1
+  }
+}
--- a/skills/monitoring-stack-deployer/assets/prometheus_config_template.yml
+++ b/skills/monitoring-stack-deployer/assets/prometheus_config_template.yml
@@ -0,0 +1,73 @@
+# Global settings; individual scrape jobs inherit these unless they override them.
+global:
+  scrape_interval: 30s  # Time between scrapes of each target. Tune to your needs.
+  evaluation_interval: 30s  # Time between rule evaluations. Tune to your needs.
+  # scrape_timeout is left unset, so the global default (10s) applies.
+
+  # Labels added when talking to external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'REPLACE_ME_MONITORING_INSTANCE'  # Identifies this Prometheus instance.
+
+# Files from which recording and alerting rules are loaded.
+rule_files:
+  # - "prometheus.rules" # Example: uncomment and point this at your own rules file.
+
+# Scrape configuration for Prometheus: one entry per job.
+scrape_configs:
+  # Example: Scrape Prometheus itself.  Good for monitoring Prometheus's own health.
+  - job_name: 'prometheus'
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+
+    static_configs:
+      - targets: ['localhost:9090']  # Prometheus's default port.  Change if needed.
+
+  # Example: Scrape node_exporter metrics.  Provides system-level metrics.
+  - job_name: 'node_exporter'
+    static_configs:
+      - targets: ['REPLACE_ME_NODE_EXPORTER_ADDRESS:9100']  # Replace with the actual node_exporter address and port.
+
+  # Example: Scrape cadvisor metrics.  Provides container-level metrics.
+  - job_name: 'cadvisor'
+    static_configs:
+      - targets: ['REPLACE_ME_CADVISOR_ADDRESS:8080']  # Replace with the actual cadvisor address and port.
+
+  # Example: Scrape Kubernetes pods that opt in through prometheus.io/* annotations.
+  - job_name: 'kubernetes-pods'
+    kubernetes_sd_configs:
+      - role: pod
+
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        action: keep  # Drop every pod not annotated prometheus.io/scrape: "true".
+        regex: 'true'  # Quoted so YAML keeps it a string instead of a boolean.
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+        action: replace  # Honour a custom metrics path from prometheus.io/path.
+        target_label: __metrics_path__
+        regex: '(.+)'
+      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+        action: replace  # Swap in the port given by prometheus.io/port.
+        regex: '([^:]+)(?::\d+)?;(\d+)'
+        replacement: '$1:$2'
+        target_label: __address__
+      - action: labelmap  # Turn prometheus.io/param_<name> annotations into URL parameters.
+        regex: '__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)'
+        replacement: '__param_$1'
+      - source_labels: [__meta_kubernetes_namespace]
+        action: replace
+        target_label: namespace
+      - source_labels: [__meta_kubernetes_pod_name]  # action defaults to replace.
+        target_label: pod
+
+# Where fired alerts are delivered.
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['REPLACE_ME_ALERTMANAGER_ADDRESS:9093']  # Point this at your Alertmanager.
+
+# Remote write configuration to send metrics to a remote endpoint (e.g., Grafana Cloud, Cortex).
+# remote_write:
+#   - url: "YOUR_VALUE_HERE" # Example: Grafana Cloud remote write endpoint.
+#     basic_auth:
+#       username: "YOUR_VALUE_HERE" # Example: Grafana Cloud username.
+#       password: "YOUR_VALUE_HERE" # Example: Grafana Cloud API key.