Initial commit
This commit is contained in:
9
skills/monitoring-stack-deployer/assets/README.md
Normal file
9
skills/monitoring-stack-deployer/assets/README.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Assets
|
||||
|
||||
Bundled resources for the monitoring-stack-deployer skill.
|
||||
|
||||
- [ ] prometheus_config_template.yml: Template for Prometheus configuration files.
|
||||
- [ ] grafana_dashboard_template.json: Template for Grafana dashboard configurations.
|
||||
- [ ] datadog_agent_config_template.yml: Template for Datadog agent configuration files.
|
||||
- [ ] example_k8s_manifests/: Example Kubernetes manifests for deploying the monitoring stack.
|
||||
- [ ] example_terraform_configurations/: Example Terraform configurations for deploying the monitoring stack.
|
||||
@@ -0,0 +1,90 @@
|
||||
# Datadog Agent Configuration Template
|
||||
|
||||
# This file provides a template for configuring the Datadog agent.
|
||||
# Replace placeholders with your specific values.
|
||||
|
||||
# Agent Configuration
|
||||
agent_hostname: REPLACE_ME_HOSTNAME # The hostname reported to Datadog. If this key is omitted, the system hostname is used.
|
||||
# agent_tags: # Optional: List of tags to apply to all metrics sent by this agent.
|
||||
# - env:production
|
||||
# - role:webserver
|
||||
|
||||
# API Key
|
||||
api_key: YOUR_DATADOG_API_KEY # Your Datadog API key. Required.
|
||||
|
||||
# Site
|
||||
# site: datadoghq.com # The Datadog site. Defaults to datadoghq.com
|
||||
|
||||
# Listen Address
|
||||
# listen_address: 0.0.0.0 # The address the agent listens on for HTTP requests. Defaults to 127.0.0.1
|
||||
|
||||
# Log Level
|
||||
log_level: INFO # Valid values: DEBUG, INFO, WARN, ERROR, CRITICAL
|
||||
|
||||
# Log to file
|
||||
# log_file: /var/log/datadog/agent.log # Uncomment to log to a file. Ensure proper permissions are set.
|
||||
|
||||
# Enable/Disable Agent
|
||||
# enabled: true # Defaults to true
|
||||
|
||||
# --- Integrations ---
|
||||
|
||||
# Example: System Check
|
||||
system_core_check:
|
||||
enabled: true
|
||||
collect_count: true # Collect CPU core counts
|
||||
# use_mount: false # Disable mount point metrics (defaults to true)
|
||||
|
||||
# Example: Network Check
|
||||
network:
|
||||
enabled: true
|
||||
collect_connection_state: true # Collect TCP connection states
|
||||
excluded_interfaces:
|
||||
- lo # Exclude loopback interface
|
||||
|
||||
# Example: Disk Check
|
||||
disk:
|
||||
enabled: true
|
||||
all_partitions: false # Only monitor the partitions listed under "partitions" below
|
||||
partitions:
|
||||
- /
|
||||
- /var
|
||||
- /tmp
|
||||
# use_mount: false # Disable mount point metrics (defaults to true)
|
||||
# excluded_filesystems: # Optional: List of filesystem types to exclude.
|
||||
# - tmpfs
|
||||
|
||||
# --- Advanced Configuration ---
|
||||
|
||||
# Proxy settings (if required)
|
||||
# proxy:
|
||||
# http: http://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
|
||||
# https: https://YOUR_PROXY_SERVER:YOUR_PROXY_PORT
|
||||
# no_proxy: localhost,127.0.0.1,YOUR_INTERNAL_DOMAIN
|
||||
|
||||
# DogStatsD Configuration (for custom metrics)
|
||||
dogstatsd_port: 8125 # UDP port for DogStatsD
|
||||
# dogstatsd_non_local_traffic: true # Enable for non-local traffic (security implications!)
|
||||
|
||||
# --- Autodiscovery ---
|
||||
# For containerized environments (Docker, Kubernetes)
|
||||
# autodiscovery_listeners:
|
||||
# - name: docker
|
||||
# - name: kubelet
|
||||
|
||||
# Example: Custom check
|
||||
# confd_path: /etc/datadog-agent/conf.d
|
||||
|
||||
# --- Process Agent ---
|
||||
# process_config:
|
||||
# enabled: true # Enable process collection
|
||||
# process_collection:
|
||||
# enabled: true
|
||||
# container_collection:
|
||||
# enabled: true
|
||||
# process_discovery:
|
||||
# enabled: false # Disable process discovery by default for security
|
||||
|
||||
# --- Security Agent ---
|
||||
# security_agent:
|
||||
# enabled: false # Disable security agent by default
|
||||
@@ -0,0 +1,258 @@
|
||||
{
|
||||
"_comment": "Grafana Dashboard Template for Monitoring Stack Deployer",
|
||||
"dashboard": {
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"_comment": "Example: CPU Usage Panel",
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"decimals": 2,
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 1,
|
||||
"interval": null,
|
||||
"legend": {
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.5.7",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total[5m])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "CPU Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"decimal": 2,
|
||||
"format": "percent",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"_comment": "Example: Memory Usage Panel",
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"decimals": 2,
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"interval": null,
|
||||
"legend": {
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.5.7",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_usage_bytes) by (instance)",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Memory Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "5s",
|
||||
"schemaVersion": 26,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"monitoring"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"_comment": "Example Prometheus Datasource Variable",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"datasource": null,
|
||||
"definition": "Prometheus",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Prometheus",
|
||||
"multi": false,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "Prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Monitoring Dashboard",
|
||||
"uid": "example-dashboard",
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
# Global configuration for Prometheus
|
||||
global:
|
||||
scrape_interval: 30s # How frequently to scrape targets. Adjust based on your needs.
|
||||
evaluation_interval: 30s # How frequently to evaluate rules. Adjust based on your needs.
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
|
||||
# External labels to identify the Prometheus instance.
|
||||
external_labels:
|
||||
monitor: 'REPLACE_ME_MONITORING_INSTANCE' # A label identifying this Prometheus instance.
|
||||
|
||||
# Rule files that define recording and alerting rules.
|
||||
rule_files:
|
||||
# - "prometheus.rules" # Example: Uncomment and adjust path to include your rules file.
|
||||
|
||||
# Scrape configuration for Prometheus.
|
||||
scrape_configs:
|
||||
# Example: Scrape Prometheus itself. Good for monitoring Prometheus's own health.
|
||||
- job_name: 'prometheus'
|
||||
# metrics_path defaults to '/metrics'
|
||||
# scheme defaults to 'http'.
|
||||
|
||||
static_configs:
|
||||
- targets: ['localhost:9090'] # Prometheus's default port. Change if needed.
|
||||
|
||||
# Example: Scrape node_exporter metrics. Provides system-level metrics.
|
||||
- job_name: 'node_exporter'
|
||||
static_configs:
|
||||
- targets: ['REPLACE_ME_NODE_EXPORTER_ADDRESS:9100'] # Replace with the actual node_exporter address and port.
|
||||
|
||||
# Example: Scrape cadvisor metrics. Provides container-level metrics.
|
||||
- job_name: 'cadvisor'
|
||||
static_configs:
|
||||
- targets: ['REPLACE_ME_CADVISOR_ADDRESS:8080'] # Replace with the actual cadvisor address and port.
|
||||
|
||||
# Example: Scrape Kubernetes pods that carry the prometheus.io/scrape: "true" annotation.
|
||||
- job_name: 'kubernetes-pods'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
||||
action: keep
|
||||
regex: true
|
||||
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
||||
action: replace
|
||||
target_label: __metrics_path__
|
||||
regex: (.+)
|
||||
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||
action: replace
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: replace
|
||||
source_labels: [__meta_kubernetes_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
|
||||
# Alerting configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['REPLACE_ME_ALERTMANAGER_ADDRESS:9093'] # Replace with your Alertmanager address.
|
||||
|
||||
# Remote write configuration to send metrics to a remote endpoint (e.g., Grafana Cloud, Cortex).
|
||||
# remote_write:
|
||||
# - url: "YOUR_VALUE_HERE" # Example: Grafana Cloud remote write endpoint.
|
||||
# basic_auth:
|
||||
# username: "YOUR_VALUE_HERE" # Example: Grafana Cloud username.
|
||||
# password: "YOUR_VALUE_HERE" # Example: Grafana Cloud API key.
|
||||
Reference in New Issue
Block a user