Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:19:52 +08:00
commit 5e822e4e98
14 changed files with 1144 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
# Assets
Bundled resources for log-aggregation-setup skill
- [ ] elk_config_template.conf: Template configuration file for Logstash.
- [ ] loki_config_template.yaml: Template configuration file for Loki.
- [ ] splunk_config_template.conf: Template configuration file for Splunk.
- [ ] example_log_data.json: Example log data in JSON format for testing the log aggregation setup.
- [ ] dashboard_elk.json: Example Kibana dashboard configuration.
- [ ] dashboard_loki.json: Example Grafana dashboard configuration for Loki.
- [ ] dashboard_splunk.json: Example Splunk dashboard configuration.

View File

@@ -0,0 +1,204 @@
{
"_comment": "Kibana Dashboard Configuration for ELK Stack",
"title": "System Performance and Log Analysis",
"description": "Dashboard providing insights into system performance and log data.",
"panels": [
{
"id": "cpu_usage",
"type": "visualization",
"title": "CPU Usage",
"description": "Displays CPU usage over time.",
"visState": {
"type": "timeseries",
"params": {
"indexPattern": "system-metrics-*",
"timeField": "@timestamp",
"interval": "auto",
"metrics": [
{
"field": "system.cpu.total.norm.pct",
"type": "avg",
"alias": "Average CPU Usage"
}
],
"xAxisMode": "timeseries",
"yAxisMode": "normal"
}
},
"gridData": {
"x": 0,
"y": 0,
"w": 12,
"h": 6
}
},
{
"id": "memory_usage",
"type": "visualization",
"title": "Memory Usage",
"description": "Displays memory usage over time.",
"visState": {
"type": "timeseries",
"params": {
"indexPattern": "system-metrics-*",
"timeField": "@timestamp",
"interval": "auto",
"metrics": [
{
"field": "system.memory.actual.used.pct",
"type": "avg",
"alias": "Average Memory Usage"
}
],
"xAxisMode": "timeseries",
"yAxisMode": "normal"
}
},
"gridData": {
"x": 0,
"y": 6,
"w": 12,
"h": 6
}
},
{
"id": "disk_usage",
"type": "visualization",
"title": "Disk Usage",
"description": "Displays disk usage over time.",
"visState": {
"type": "timeseries",
"params": {
"indexPattern": "system-metrics-*",
"timeField": "@timestamp",
"interval": "auto",
"metrics": [
{
"field": "system.filesystem.used.pct",
"type": "avg",
"alias": "Average Disk Usage"
}
],
"xAxisMode": "timeseries",
"yAxisMode": "normal"
}
},
"gridData": {
"x": 12,
"y": 0,
"w": 12,
"h": 6
}
},
{
"id": "log_level_distribution",
"type": "visualization",
"title": "Log Level Distribution",
"description": "Displays the distribution of log levels.",
"visState": {
"type": "pie",
"params": {
"indexPattern": "application-logs-*",
"timeField": "@timestamp",
"interval": "auto",
"metrics": [
{
"field": "log.level",
"type": "count",
"alias": "Count"
}
],
"xAxisMode": "categorical",
"yAxisMode": "normal",
"terms": {
"field": "log.level",
"size": 5
}
}
},
"gridData": {
"x": 12,
"y": 6,
"w": 6,
"h": 6
}
},
{
"id": "error_rate",
"type": "visualization",
"title": "Error Rate",
"description": "Displays the rate of error logs over time.",
"visState": {
"type": "timeseries",
"params": {
"indexPattern": "application-logs-*",
"timeField": "@timestamp",
"interval": "auto",
"metrics": [
{
"field": "log.level",
"type": "count",
"alias": "Error Count",
"filters": [
{
"field": "log.level",
"operator": "is",
"value": "error"
}
]
}
],
"xAxisMode": "timeseries",
"yAxisMode": "normal"
}
},
"gridData": {
"x": 18,
"y": 6,
"w": 6,
"h": 6
}
},
{
"id": "log_table",
"type": "visualization",
"title": "Recent Logs",
"description": "Displays a table of recent log entries.",
"visState": {
"type": "table",
"params": {
"indexPattern": "application-logs-*",
"timeField": "@timestamp",
"columns": [
"@timestamp",
"log.level",
"message",
"service.name"
],
"sort": {
"field": "@timestamp",
"direction": "desc"
},
"pageSize": 10
}
},
"gridData": {
"x": 0,
"y": 12,
"w": 24,
"h": 6
}
}
],
"timeRestore": true,
"timeTo": "now",
"timeFrom": "now-15m",
"refreshInterval": {
"pause": false,
"value": 15000
},
"indexPatternRefName": "kibana_index_pattern_ref",
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"filter\": []}"
}
}

View File

@@ -0,0 +1,288 @@
{
"_comment": "Grafana dashboard for Loki",
"dashboard": {
"annotations": {
"list": []
},
"description": "Example Grafana dashboard for Loki log aggregation.",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"_comment": "Panel: Logs overview",
"datasource": null,
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"display": "auto",
"filterable": true
},
"mappings": [],
"min": null,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": true,
"showCommonLabels": true,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"pluginVersion": "7.5.7",
"targets": [
{
"datasource": "${DS_LOKI}",
"editorMode": "code",
"expr": "{job=\"my-app\"} |= \"error\"",
"instant": false,
"queryType": "range",
"refId": "A"
}
],
"title": "Error Logs",
"type": "logs"
},
{
"_comment": "Panel: Log volume over time",
"datasource": null,
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"showPoints": "auto",
"spanNulls": false,
"stacking": "normal",
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": null,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "7.5.7",
"targets": [
{
"datasource": "${DS_LOKI}",
"editorMode": "code",
"expr": "rate({job=\"my-app\"} |= `error` [1m])",
"instant": false,
"legendFormat": "{{job}}",
"queryType": "range",
"refId": "A"
}
],
"title": "Error Log Volume",
"type": "timeseries"
},
{
"_comment": "Panel: HTTP Request Latency",
"datasource": null,
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"showPoints": "auto",
"spanNulls": false,
"stacking": "normal",
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": null,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "7.5.7",
"targets": [
{
"datasource": "${DS_LOKI}",
"editorMode": "code",
"expr": "quantile_over_time(0.99, {job=\"my-app\", endpoint=\"/api/users\"} | json | unwrap duration [1m]) by (job)",
"instant": false,
"legendFormat": "99th percentile",
"queryType": "range",
"refId": "A"
}
],
"title": "HTTP Request Latency (99th percentile)",
"type": "timeseries"
}
],
"refresh": "1m",
"schemaVersion": 30,
"style": "dark",
"tags": [
"loki",
"logs",
"example"
],
"templating": {
"list": [
{
"_comment": "Loki datasource variable",
"current": {
"text": "Loki",
"value": "Loki"
},
"datasource": null,
"definition": "Loki",
"hide": 0,
"includeAll": false,
"label": "Loki Datasource",
"multi": false,
"name": "DS_LOKI",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "datasource",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Loki Log Aggregation Dashboard",
"uid": "loki-aggregation-dashboard",
"version": 1
}
}

View File

@@ -0,0 +1,141 @@
{
"_comment": "Splunk Dashboard Configuration - Example",
"dashboard": {
"label": "Application Performance Overview",
"description": "A dashboard providing insights into application performance and health.",
"version": "1.0",
"layout": {
"type": "absolute",
"options": {
"width": "100%",
"height": "100%"
}
},
"panels": [
{
"id": "panel1",
"title": "Requests per Minute",
"description": "Shows the rate of incoming requests.",
"type": "timeseries",
"options": {
"xAxisTitle": "Time",
"yAxisTitle": "Requests/Minute"
},
"search": {
"query": "index=main sourcetype=access_combined | timechart span=1m count",
"earliest": "-15m",
"latest": "now"
},
"position": {
"x": 0,
"y": 0,
"width": 6,
"height": 4
}
},
{
"id": "panel2",
"title": "Error Rate",
"description": "Displays the percentage of error responses.",
"type": "singlevalue",
"options": {
"unit": "%",
"underLabel": "Error Rate (Last 15 minutes)"
},
"search": {
"query": "index=main sourcetype=access_combined | stats count as total, count(eval(status>=500)) as errors | eval error_rate=if(total>0, round((errors/total)*100,2), 0) | fields error_rate",
"earliest": "-15m",
"latest": "now"
},
"position": {
"x": 6,
"y": 0,
"width": 3,
"height": 4
}
},
{
"id": "panel3",
"title": "Average Response Time",
"description": "Measures the average time taken to process requests.",
"type": "singlevalue",
"options": {
"unit": "ms",
"underLabel": "Average Response Time (Last 15 minutes)"
},
"search": {
"query": "index=main sourcetype=access_combined | stats avg(response_time) as avg_rt | eval avg_rt=round(avg_rt,2)",
"earliest": "-15m",
"latest": "now"
},
"position": {
"x": 9,
"y": 0,
"width": 3,
"height": 4
}
},
{
"id": "panel4",
"title": "Top 10 Slowest Endpoints",
"description": "Lists the endpoints with the highest average response times.",
"type": "table",
"options": {
"drilldown": "none"
},
"search": {
"query": "index=main sourcetype=access_combined | stats avg(response_time) as avg_rt by uri | sort -avg_rt | head 10",
"earliest": "-1h",
"latest": "now"
},
"position": {
"x": 0,
"y": 4,
"width": 6,
"height": 4
}
},
{
"id": "panel5",
"title": "Server CPU Utilization",
"description": "Displays the CPU utilization across all servers.",
"type": "timeseries",
"options": {
"xAxisTitle": "Time",
"yAxisTitle": "% CPU Utilization"
},
"search": {
"query": "index=os sourcetype=cpu | eval cpu_utilization=100-percentIdle | timechart span=1m avg(cpu_utilization) as cpu_utilization by host",
"earliest": "-15m",
"latest": "now"
},
"position": {
"x": 6,
"y": 4,
"width": 6,
"height": 4
}
},
{
"id": "panel6",
"title": "Recent Error Logs",
"description": "Shows the most recent error logs.",
"type": "event",
"options": {
"count": 5
},
"search": {
"query": "index=main sourcetype=application log_level=ERROR",
"earliest": "-1h",
"latest": "now"
},
"position": {
"x": 0,
"y": 8,
"width": 12,
"height": 4
}
}
]
}
}

View File

@@ -0,0 +1,107 @@
[
{
"_comment": "Example log entry from a web server",
"timestamp": "2024-01-26T10:00:00.000Z",
"log_level": "INFO",
"component": "web_server",
"message": "Request received",
"request_id": "a1b2c3d4e5f6",
"client_ip": "192.168.1.100",
"http_method": "GET",
"http_path": "/api/users",
"http_status_code": 200,
"response_time_ms": 123
},
{
"_comment": "Example log entry from a database",
"timestamp": "2024-01-26T10:00:01.000Z",
"log_level": "DEBUG",
"component": "database",
"message": "SQL query executed",
"query": "SELECT * FROM users WHERE id = 1",
"execution_time_ms": 5,
"rows_returned": 1
},
{
"_comment": "Example log entry from an application",
"timestamp": "2024-01-26T10:00:02.000Z",
"log_level": "ERROR",
"component": "application",
"message": "Error processing request",
"error_code": 500,
"error_message": "Internal server error",
"request_id": "a1b2c3d4e5f6",
"user_id": 123
},
{
"_comment": "Example log entry from a system",
"timestamp": "2024-01-26T10:00:03.000Z",
"log_level": "WARN",
"component": "system",
"message": "Disk space nearing capacity",
"disk_usage_percent": 90,
"disk_path": "/var/log"
},
{
"_comment": "Example log entry from a security component",
"timestamp": "2024-01-26T10:00:04.000Z",
"log_level": "INFO",
"component": "security",
"message": "Authentication successful",
"user_id": 456,
"username": "testuser",
"client_ip": "192.168.1.200"
},
{
"_comment": "Example log entry for authentication failure",
"timestamp": "2024-01-26T10:00:05.000Z",
"log_level": "WARN",
"component": "security",
"message": "Authentication failed",
"username": "invaliduser",
"client_ip": "192.168.1.200",
"reason": "Invalid password"
},
{
"_comment": "Example log entry from a microservice",
"timestamp": "2024-01-26T10:00:06.000Z",
"log_level": "INFO",
"component": "microservice-auth",
"message": "User authenticated",
"user_id": 789,
"username": "validuser",
"service_name": "auth-service"
},
{
"_comment": "Example log entry with exception details",
"timestamp": "2024-01-26T10:00:07.000Z",
"log_level": "ERROR",
"component": "application",
"message": "Unhandled exception",
"exception_type": "NullReferenceException",
"exception_message": "Object reference not set to an instance of an object.",
"stack_trace": "at MyApp.Main.DoSomething() in MyApp.cs:line 20",
"request_id": "g7h8i9j0k1l2"
},
{
"_comment": "Example log entry with metrics",
"timestamp": "2024-01-26T10:00:08.000Z",
"log_level": "INFO",
"component": "monitoring",
"message": "System metrics",
"cpu_usage_percent": 35,
"memory_usage_percent": 60,
"network_throughput_kbps": 1024
},
{
"_comment": "Example log entry with audit information",
"timestamp": "2024-01-26T10:00:09.000Z",
"log_level": "INFO",
"component": "audit",
"message": "User profile updated",
"user_id": 123,
"updated_field": "email",
"old_value": "old@example.com",
"new_value": "new@example.com"
}
]

View File

@@ -0,0 +1,75 @@
# Loki Configuration File
# This file configures the Loki log aggregation system.
auth_enabled: false # Disable authentication for simplicity (REPLACE_ME: Enable authentication in production)
server:
http_listen_port: 3100 # Port Loki listens on for HTTP requests
grpc_listen_port: 9096 # Port Loki listens on for gRPC requests
ingester:
lifecycler:
address: 127.0.0.1 # Address of the ingester
ring:
kvstore:
store: inmemory # Use in-memory store for simplicity (REPLACE_ME: Use a persistent store like Consul or etcd in production)
replication_factor: 1 # Number of replicas for log data
wal:
enabled: true # Enable Write-Ahead Log for durability
dir: /tmp/loki/wal # Directory for the Write-Ahead Log (REPLACE_ME: Use a persistent volume in production)
chunk_idle_period: 1h # Time after which an inactive chunk is flushed to storage
chunk_block_size: 262144 # Size of each chunk block (256KB)
chunk_retain_period: 24h # How long chunks are kept in ingester memory after they have been flushed to storage
max_transfer_retries: 0 # Maximum number of retries for transferring chunks
schema_config:
configs:
- from: 2020-10-24 # Start date for this schema
store: boltdb-shipper # Use BoltDB shipper for index storage
object_store: filesystem # Use filesystem for chunk storage
schema: v11 # Schema version
index:
prefix: index_ # Prefix for index keys
period: 24h # Index rotation period
storage_config:
boltdb_shipper:
active_index_directory: /tmp/loki/index # Directory for the active index (REPLACE_ME: Use a persistent volume in production)
shared_store: filesystem # Object store the index files are shipped to (chunks live under the filesystem directory below)
filesystem:
directory: /tmp/loki/chunks # Directory for chunk storage (REPLACE_ME: Use a persistent volume in production)
limits_config:
enforce_metric_name: false # Disable enforcement of metric names
reject_old_samples: true # Reject samples older than the configured time
reject_old_samples_max_age: 168h # Maximum age of samples (7 days)
max_global_streams_per_user: 0 # 0 means unlimited
max_streams_per_user: 0 # 0 means unlimited
ingestion_rate_mb: 100 # Maximum ingestion rate in MB/s
ingestion_burst_size_mb: 200 # Maximum burst size in MB
max_line_size: 512000 # Maximum line size in bytes (500KB)
# max_line_length: 512000 # Not a recognized limits_config key (Loki rejects unknown fields); max_line_size above is the supported setting
max_query_lookback: 720h # Maximum query lookback (30 days)
split_queries_by_interval: 12h # Split queries by this interval
# max_concurrent_queries: 30 # Not a limits_config key; query concurrency is set with max_concurrent under the querier block
max_query_series: 1000 # Maximum number of series returned by a query
max_query_parallelism: 16 # Maximum query parallelism
max_query_length: 720h # Maximum query length (30 days)
retention_period: 720h # Maximum age of chunks to retain (30 days); applied by the compactor when retention_enabled is true
compactor:
working_directory: /tmp/loki/compactor # Directory for compactor working files (REPLACE_ME: Use a persistent volume in production)
shared_store: filesystem # Use filesystem for shared storage
compaction_interval: 1h # Interval between compactor runs
retention_enabled: true # Enable retention of old chunks
retention_delete_delay: 24h # Delay before deleting old chunks
ruler:
storage:
type: local
local:
directory: /tmp/loki/rules # Directory to store the rules (REPLACE_ME: Use a persistent volume in production)
rule_path: /tmp/loki/rules-temp # Scratch directory for evaluated rule files; must differ from the storage directory above
alertmanager_url: "" # URL of the Alertmanager instance (REPLACE_ME: YOUR_ALERTMANAGER_URL)
poll_interval: 30s # Interval to poll for rule changes
enable_api: true # Enable the API for managing rules

View File

@@ -0,0 +1,114 @@
# Splunk Configuration Template
# This file provides a template for configuring Splunk to collect and index logs.
# It includes examples for various log sources and configurations.
# Please review and modify this file according to your specific environment and requirements.
# ==============================================================================
# Global Settings
# ==============================================================================
[default]
# Replace with the actual hostname of the Splunk instance.
# (Splunk .conf files do not support inline comments; keep comments on their own line.)
host = <YOUR_HOSTNAME>
# ==============================================================================
# Input Configuration: System Logs (Syslog)
# ==============================================================================
# Configure a UDP input for receiving syslog messages.
# Ensure your syslog daemon is configured to forward logs to this Splunk instance.
[udp://514]
connection_host = ip
sourcetype = syslog
# Change if you want to index into a different index
index = main
disabled = false
# ==============================================================================
# Input Configuration: File Monitoring (Tail)
# ==============================================================================
# Monitor a specific log file. Useful for application logs.
# Adjust the path and sourcetype accordingly.
[monitor:///var/log/<YOUR_APPLICATION>/<YOUR_APPLICATION>.log]
sourcetype = <YOUR_APPLICATION>_log
# Change if you want to index into a different index
index = main
disabled = false
# Optional: Multiline event breaking (if needed). This setting belongs in props.conf under the sourcetype stanza.
# MUST_BREAK_AFTER = ^\d{4}-\d{2}-\d{2}
# ==============================================================================
# Input Configuration: Windows Event Logs (Windows)
# ==============================================================================
# Configure Splunk to collect Windows Event Logs.
# Adjust the event logs to monitor as needed.
[WinEventLog://Application]
disabled = false
index = wineventlog
sourcetype = WinEventLog:Application
# Optional: Resolve Active Directory objects
# evt_resolve_ad_obj = 1
# Optional: Filter events by event code (example: only collect events with ID 4624 and 4625)
# whitelist = 4624,4625
[WinEventLog://System]
disabled = false
index = wineventlog
sourcetype = WinEventLog:System
[WinEventLog://Security]
disabled = false
index = wineventlog
sourcetype = WinEventLog:Security
# IMPORTANT: Consider the volume of security logs and storage implications.
# ==============================================================================
# Input Configuration: Scripted Input (Example: CPU Utilization)
# ==============================================================================
# Example of a scripted input to collect CPU utilization.
# Requires a script (e.g., cpu_utilization.sh or cpu_utilization.ps1)
# that outputs the CPU utilization in a structured format (e.g., CSV, JSON).
[script://$SPLUNK_HOME/etc/apps/<YOUR_APP_NAME>/bin/cpu_utilization.sh]
# Run every 60 seconds
interval = 60
sourcetype = cpu_utilization
# Consider a dedicated metrics index
index = metrics
disabled = false
# ==============================================================================
# Transformations (Optional)
# ==============================================================================
# Use transformations to modify events before they are indexed.
# Example: Masking sensitive data.
# [transform-null]
# REGEX = (.*)<SENSITIVE_FIELD>.*
# DEST_KEY = _raw
# FORMAT = $1<SENSITIVE_FIELD>MASKED
# ==============================================================================
# Index Configuration (Optional)
# ==============================================================================
# Configure index-specific settings.
# [<YOUR_INDEX_NAME>]
# homePath = $SPLUNK_DB/<YOUR_INDEX_NAME>/db
# coldPath = $SPLUNK_DB/<YOUR_INDEX_NAME>/colddb
# thawedPath = $SPLUNK_DB/<YOUR_INDEX_NAME>/thaweddb
# maxDataSize = auto
# 90 days retention; the value must be given in seconds (90 * 86400)
# frozenTimePeriodInSecs = 7776000
# ==============================================================================
# Notes
# ==============================================================================
# * Replace placeholders with actual values.
# * Ensure proper permissions are set for log files and scripts.
# * Test configurations thoroughly before deploying to production.
# * Consider using Splunk's monitoring console for health checks and troubleshooting.
# * Review Splunk documentation for detailed information on configuration options.