Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:19:54 +08:00
commit 67c918fc64
11 changed files with 616 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
# Assets
Bundled resources for the monitoring-stack-deployer skill.
- [ ] prometheus_config_template.yml: Template for Prometheus configuration files.
- [ ] grafana_dashboard_template.json: Template for Grafana dashboard configurations.
- [ ] datadog_agent_config_template.yml: Template for Datadog agent configuration files.
- [ ] example_k8s_manifests/: Example Kubernetes manifests for deploying the monitoring stack.
- [ ] example_terraform_configurations/: Example Terraform configurations for deploying the monitoring stack.

View File

@@ -0,0 +1,90 @@
---
# Datadog Agent Configuration Template
# Template for the Agent's main configuration file (datadog.yaml).
# Replace every placeholder with your own values before deploying.

# --- Agent Configuration ---

# Hostname reported to Datadog. The Agent reads the `hostname` key; remove
# this line to fall back to the system hostname.
hostname: REPLACE_ME_HOSTNAME

# Optional: tags applied to all metrics sent by this Agent.
# tags:
#   - env:production
#   - role:webserver

# API key. Required. Kept quoted so that a key made only of digits is still
# read as a string.
api_key: "YOUR_DATADOG_API_KEY"

# Datadog site that receives the data. Defaults to datadoghq.com.
# site: datadoghq.com

# Listen address for HTTP requests. Defaults to 127.0.0.1.
# NOTE(review): `listen_address` does not appear in the Agent v6/v7
# datadog.yaml reference; `bind_host` looks like the supported option.
# Confirm before uncommenting.
# listen_address: 0.0.0.0

# Minimum log level.
# Valid values: trace, debug, info, warn, error, critical, off
# (quote "off" if you use it, otherwise YAML reads it as a boolean).
log_level: info

# Uncomment to log to a file. Ensure proper permissions are set.
# log_file: /var/log/datadog/agent.log

# Enable/disable the Agent. Defaults to true.
# NOTE(review): no top-level `enabled` switch is documented for datadog.yaml;
# confirm that the consumer of this template reads it.
# enabled: true

# --- Integrations ---
# NOTE(review): Agent v6/v7 normally loads check settings from
# conf.d/<check>.d/conf.yaml (init_config / instances) rather than from
# top-level keys in datadog.yaml. Confirm that the deployer relocates the
# three sections below before relying on them.

# Example: System Check
system_core_check:
  enabled: true
  collect_count: true  # Collect CPU core counts
  # use_mount: false  # Disable mount point metrics (defaults to true)

# Example: Network Check
network:
  enabled: true
  collect_connection_state: true  # Collect TCP connection states
  excluded_interfaces:
    - lo  # Exclude loopback interface

# Example: Disk Check
disk:
  enabled: true
  all_partitions: false  # Only monitor certain partitions
  partitions:
    - /
    - /var
    - /tmp
  # use_mount: false  # Disable mount point metrics (defaults to true)
  # Optional: list of filesystem types to exclude.
  # excluded_filesystems:
  #   - tmpfs

# --- Advanced Configuration ---

# Proxy settings (if required). `no_proxy` is a YAML list of hosts that must
# bypass the proxy, not a comma-separated string.
# proxy:
#   http: http://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
#   https: https://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
#   no_proxy:
#     - localhost
#     - 127.0.0.1
#     - YOUR_INTERNAL_DOMAIN

# DogStatsD Configuration (for custom metrics)
dogstatsd_port: 8125  # UDP port for DogStatsD
# Accept DogStatsD packets from other hosts (security implications!).
# dogstatsd_non_local_traffic: true

# --- Autodiscovery ---
# For containerized environments (Docker, Kubernetes). The Agent reads
# Autodiscovery listeners from the `listeners` key.
# listeners:
#   - name: docker
#   - name: kubelet

# Example: directory that holds custom check configuration.
# confd_path: /etc/datadog-agent/conf.d

# --- Process Agent ---
# process_config:
#   enabled: true  # Enable process collection
#   process_collection:
#     enabled: true
#   container_collection:
#     enabled: true
#   process_discovery:
#     enabled: false  # Disable process discovery by default for security

# --- Security Agent ---
# NOTE(review): Agent v7 documents `runtime_security_config.enabled` and
# `compliance_config.enabled` for these features; confirm `security_agent`
# is honoured before uncommenting.
# security_agent:
#   enabled: false  # Disable security agent by default

View File

@@ -0,0 +1,258 @@
{
"_comment": "Grafana Dashboard Template for Monitoring Stack Deployer",
"dashboard": {
"annotations": {
"list": []
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"_comment": "Example: CPU Usage Panel",
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 1,
"interval": null,
"legend": {
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(process_cpu_seconds_total[5m])",
"instant": false,
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
  "decimals": 2,
  "format": "percentunit",
  "label": null,
  "logBase": 1,
  "max": null,
  "min": "0",
  "show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"_comment": "Example: Memory Usage Panel",
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"decimals": 2,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 2,
"interval": null,
"legend": {
"avg": true,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_usage_bytes) by (instance)",
"instant": false,
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"refresh": "5s",
"schemaVersion": 26,
"style": "dark",
"tags": [
"monitoring"
],
"templating": {
"list": [
{
"_comment": "Example Prometheus Datasource Variable",
"current": {
"selected": false,
"text": "Prometheus",
"value": "Prometheus"
},
"datasource": null,
"definition": "Prometheus",
"hide": 0,
"includeAll": false,
"label": "Prometheus",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "Prometheus",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Monitoring Dashboard",
"uid": "example-dashboard",
"version": 1
}
}

View File

@@ -0,0 +1,73 @@
---
# Prometheus server configuration template.
# Swap every REPLACE_ME_* / YOUR_VALUE_HERE placeholder for a real value
# before deploying.

# Defaults shared by every scrape job and by rule evaluation.
global:
  # Time between scrapes of each target. Adjust based on your needs.
  scrape_interval: 30s
  # Time between rule evaluations. Adjust based on your needs.
  evaluation_interval: 30s
  # scrape_timeout is not set here, so the global default (10s) applies.

  # Labels that identify this Prometheus instance.
  external_labels:
    monitor: "REPLACE_ME_MONITORING_INSTANCE"

# Rule files that define alerting rules. Nothing is loaded until an entry
# below is uncommented.
rule_files:
  # - "prometheus.rules"  # Example: uncomment and adjust the path to your rules file.

# Jobs describing what Prometheus scrapes.
scrape_configs:
  # Prometheus scraping itself; useful for monitoring its own health.
  # metrics_path ('/metrics') and scheme ('http') are left at their defaults.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"  # Prometheus's default port. Change if needed.

  # node_exporter: system-level metrics.
  - job_name: "node_exporter"
    static_configs:
      - targets:
          # Replace with the actual node_exporter address and port.
          - "REPLACE_ME_NODE_EXPORTER_ADDRESS:9100"

  # cadvisor: container-level metrics.
  - job_name: "cadvisor"
    static_configs:
      - targets:
          # Replace with the actual cadvisor address and port.
          - "REPLACE_ME_CADVISOR_ADDRESS:8080"

  # Kubernetes pods found through service discovery. The relabel rules below
  # select and configure targets from the pods' prometheus.io/* annotations.
  - job_name: "kubernetes-pods"
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods whose prometheus.io/scrape annotation is "true".
      - action: keep
        source_labels:
          - __meta_kubernetes_pod_annotation_prometheus_io_scrape
        regex: true
      # A non-empty prometheus.io/path annotation overrides the metrics path.
      - action: replace
        source_labels:
          - __meta_kubernetes_pod_annotation_prometheus_io_path
        regex: '(.+)'
        target_label: __metrics_path__
      # Rebuild the scrape address as <host>:<prometheus.io/port annotation>,
      # dropping any port already present on the discovered address.
      - action: replace
        source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'
        target_label: __address__
      # Copy prometheus.io/param_<name> annotations into __param_<name> labels.
      - action: labelmap
        regex: '__meta_kubernetes_pod_annotation_prometheus_io_param_(.+)'
        replacement: '__param_$1'
      # Expose the pod's namespace as the `namespace` label.
      - action: replace
        source_labels:
          - __meta_kubernetes_namespace
        target_label: namespace
      # Expose the pod's name as the `pod` label. No action is given, so
      # Prometheus applies its default (replace).
      - source_labels:
          - __meta_kubernetes_pod_name
        target_label: pod

# Where alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Replace with your Alertmanager address.
            - "REPLACE_ME_ALERTMANAGER_ADDRESS:9093"

# Remote write: forwards metrics to a remote endpoint (e.g., Grafana Cloud, Cortex).
# remote_write:
#   - url: "YOUR_VALUE_HERE"  # Example: Grafana Cloud remote write endpoint.
#     basic_auth:
#       username: "YOUR_VALUE_HERE"  # Example: Grafana Cloud username.
#       password: "YOUR_VALUE_HERE"  # Example: Grafana Cloud API key.