From ccc4f37ba961b4a7f8a2c61eaeb9c55d3aeb8e19 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 18:52:28 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 15 + README.md | 3 + commands/create-monitoring.md | 2181 +++++++++++++++++ plugin.lock.json | 97 + skills/skill-adapter/assets/README.md | 7 + .../skill-adapter/assets/config-template.json | 32 + .../assets/dashboard_template.json | 90 + .../assets/example_dashboard_config.yaml | 113 + skills/skill-adapter/assets/skill-schema.json | 28 + skills/skill-adapter/assets/test-data.json | 27 + .../assets/visualization_examples.md | 122 + skills/skill-adapter/references/README.md | 7 + .../references/best-practices.md | 69 + skills/skill-adapter/references/examples.md | 70 + skills/skill-adapter/scripts/README.md | 7 + .../skill-adapter/scripts/helper-template.sh | 42 + skills/skill-adapter/scripts/validation.sh | 32 + 17 files changed, 2942 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 commands/create-monitoring.md create mode 100644 plugin.lock.json create mode 100644 skills/skill-adapter/assets/README.md create mode 100644 skills/skill-adapter/assets/config-template.json create mode 100644 skills/skill-adapter/assets/dashboard_template.json create mode 100644 skills/skill-adapter/assets/example_dashboard_config.yaml create mode 100644 skills/skill-adapter/assets/skill-schema.json create mode 100644 skills/skill-adapter/assets/test-data.json create mode 100644 skills/skill-adapter/assets/visualization_examples.md create mode 100644 skills/skill-adapter/references/README.md create mode 100644 skills/skill-adapter/references/best-practices.md create mode 100644 skills/skill-adapter/references/examples.md create mode 100644 skills/skill-adapter/scripts/README.md create mode 100755 skills/skill-adapter/scripts/helper-template.sh create mode 100755 skills/skill-adapter/scripts/validation.sh diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..3be95fa --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "api-monitoring-dashboard", + "description": "Create monitoring dashboards for API health, metrics, and alerts", + "version": "1.0.0", + "author": { + "name": "Jeremy Longshore", + "email": "[email protected]" + }, + "skills": [ + "./skills" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cafea95 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# api-monitoring-dashboard + +Create monitoring dashboards for API health, metrics, and alerts diff --git a/commands/create-monitoring.md b/commands/create-monitoring.md new file mode 100644 index 0000000..e97bf3d --- /dev/null +++ b/commands/create-monitoring.md @@ -0,0 +1,2181 @@ +--- +description: Create API monitoring dashboard +shortcut: monitor +--- + +# Create API Monitoring Dashboard + +Build comprehensive monitoring infrastructure with metrics, logs, traces, and alerts for full API observability. 
+ +## When to Use This Command + +Use `/create-monitoring` when you need to: +- Establish observability for production APIs +- Track RED metrics (Rate, Errors, Duration) across services +- Set up real-time alerting for SLO violations +- Debug performance issues with distributed tracing +- Create executive dashboards for API health +- Implement SRE practices with data-driven insights + +DON'T use this when: +- Building proof-of-concept applications (use lightweight logging instead) +- Monitoring non-critical internal tools (basic health checks may suffice) +- Resources are extremely constrained (consider managed solutions like Datadog first) + +## Design Decisions + +This command implements a **Prometheus + Grafana stack** as the primary approach because: +- Open-source with no vendor lock-in +- Industry-standard metric format with wide ecosystem support +- Powerful query language (PromQL) for complex analysis +- Horizontal scalability via federation and remote storage + +**Alternative considered: ELK Stack** (Elasticsearch, Logstash, Kibana) +- Better for log-centric analysis +- Higher resource requirements +- More complex operational overhead +- Recommended when logs are primary data source + +**Alternative considered: Managed solutions** (Datadog, New Relic) +- Faster time-to-value +- Higher ongoing cost +- Less customization flexibility +- Recommended for teams without dedicated DevOps + +## Prerequisites + +Before running this command: +1. Docker and Docker Compose installed +2. API instrumented with metrics endpoints (Prometheus format) +3. Basic understanding of PromQL query language +4. Network access for inter-service communication +5. Sufficient disk space for time-series data (plan for 2-4 weeks retention) + +## Implementation Process + +### Step 1: Configure Prometheus +Set up Prometheus to scrape metrics from your API endpoints with service discovery. + +### Step 2: Create Grafana Dashboards +Build visualizations for RED metrics, custom business metrics, and SLO tracking. + +### Step 3: Implement Distributed Tracing +Integrate Jaeger for end-to-end request tracing across microservices. + +### Step 4: Configure Alerting +Set up AlertManager rules for critical thresholds with notification channels (Slack, PagerDuty). + +### Step 5: Deploy Monitoring Stack +Deploy complete observability infrastructure with health checks and backup configurations. 
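+
+The Output Format section below lists a generated `prometheus.yml`, which the examples never show. Here is a minimal sketch of the scrape configuration Step 1 refers to — the job names and target addresses are placeholders to adapt to your services, while the rule file path and the `alertmanager:9093` target line up with the Docker Compose stack in Example 2, and the `api-services` job name matches the `up{job="api-services"}` alert in Example 4.
+
+```yaml
+# prometheus.yml - minimal scrape configuration (sketch; targets are placeholders)
+global:
+  scrape_interval: 15s       # how often targets are scraped (matches the default option)
+  evaluation_interval: 15s   # how often alerting/recording rules are evaluated
+
+rule_files:
+  - /etc/prometheus/alerting-rules.yml   # mounted by the Compose file in Example 2
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']  # AlertManager container on the monitoring network
+
+scrape_configs:
+  # Prometheus scraping itself (meta-monitoring)
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # API services exposing /metrics in Prometheus format (see Example 1)
+  - job_name: 'api-services'
+    metrics_path: /metrics
+    static_configs:
+      - targets: ['api-gateway:8080']     # placeholder host:port for your instrumented API
+```
+
+Static targets keep the sketch minimal; in larger environments, Prometheus service discovery (for example `dns_sd_configs` or `kubernetes_sd_configs`) can replace the static lists, which is what Step 1 means by service discovery.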
+ +## Output Format + +The command generates: +- `docker-compose.yml` - Complete monitoring stack configuration +- `prometheus.yml` - Prometheus scrape configuration +- `grafana-dashboards/` - Pre-built dashboard JSON files +- `alerting-rules.yml` - AlertManager rule definitions +- `jaeger-config.yml` - Distributed tracing configuration +- `README.md` - Deployment and operation guide + +## Code Examples + +### Example 1: Complete Node.js Express API with Comprehensive Monitoring + +```javascript +// metrics/instrumentation.js - Full-featured Prometheus instrumentation +const promClient = require('prom-client'); +const { performance } = require('perf_hooks'); +const os = require('os'); + +class MetricsCollector { + constructor() { + // Create separate registries for different metric types + this.register = new promClient.Registry(); + this.businessRegister = new promClient.Registry(); + + // Add default system metrics + promClient.collectDefaultMetrics({ + register: this.register, + prefix: 'api_', + gcDurationBuckets: [0.001, 0.01, 0.1, 1, 2, 5] + }); + + // Initialize all metric types + this.initializeMetrics(); + this.initializeBusinessMetrics(); + this.initializeCustomCollectors(); + + // Start periodic collectors + this.startPeriodicCollectors(); + } + + initializeMetrics() { + // RED Metrics (Rate, Errors, Duration) + this.httpRequestDuration = new promClient.Histogram({ + name: 'http_request_duration_seconds', + help: 'Duration of HTTP requests in seconds', + labelNames: ['method', 'route', 'status_code', 'service', 'environment'], + buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] + }); + + this.httpRequestTotal = new promClient.Counter({ + name: 'http_requests_total', + help: 'Total number of HTTP requests', + labelNames: ['method', 'route', 'status_code', 'service', 'environment'] + }); + + this.httpRequestErrors = new promClient.Counter({ + name: 'http_request_errors_total', + help: 'Total number of HTTP errors', + labelNames: ['method', 'route', 'error_type', 'service', 'environment'] + }); + + // Database metrics + this.dbQueryDuration = new promClient.Histogram({ + name: 'db_query_duration_seconds', + help: 'Database query execution time', + labelNames: ['operation', 'table', 'database', 'status'], + buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5] + }); + + this.dbConnectionPool = new promClient.Gauge({ + name: 'db_connection_pool_size', + help: 'Database connection pool metrics', + labelNames: ['state', 'database'] // states: active, idle, total + }); + + // Cache metrics + this.cacheHitRate = new promClient.Counter({ + name: 'cache_operations_total', + help: 'Cache operation counts', + labelNames: ['operation', 'cache_name', 'status'] // hit, miss, set, delete + }); + + this.cacheLatency = new promClient.Histogram({ + name: 'cache_operation_duration_seconds', + help: 'Cache operation latency', + labelNames: ['operation', 'cache_name'], + buckets: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1] + }); + + // External API metrics + this.externalApiCalls = new promClient.Histogram({ + name: 'external_api_duration_seconds', + help: 'External API call duration', + labelNames: ['service', 'endpoint', 'status_code'], + buckets: [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + }); + + // Circuit breaker metrics + this.circuitBreakerState = new promClient.Gauge({ + name: 'circuit_breaker_state', + help: 'Circuit breaker state (0=closed, 1=open, 2=half-open)', + labelNames: ['service'] + }); + + // Rate limiting metrics + this.rateLimitHits = 
new promClient.Counter({ + name: 'rate_limit_hits_total', + help: 'Number of rate limited requests', + labelNames: ['limit_type', 'client_type'] + }); + + // WebSocket metrics + this.activeWebsockets = new promClient.Gauge({ + name: 'websocket_connections_active', + help: 'Number of active WebSocket connections', + labelNames: ['namespace', 'room'] + }); + + // Register all metrics + [ + this.httpRequestDuration, this.httpRequestTotal, this.httpRequestErrors, + this.dbQueryDuration, this.dbConnectionPool, this.cacheHitRate, + this.cacheLatency, this.externalApiCalls, this.circuitBreakerState, + this.rateLimitHits, this.activeWebsockets + ].forEach(metric => this.register.registerMetric(metric)); + } + + initializeBusinessMetrics() { + // User activity metrics + this.activeUsers = new promClient.Gauge({ + name: 'business_active_users', + help: 'Number of active users in the last 5 minutes', + labelNames: ['user_type', 'plan'] + }); + + this.userSignups = new promClient.Counter({ + name: 'business_user_signups_total', + help: 'Total user signups', + labelNames: ['source', 'plan', 'country'] + }); + + // Transaction metrics + this.transactionAmount = new promClient.Histogram({ + name: 'business_transaction_amount_dollars', + help: 'Transaction amounts in dollars', + labelNames: ['type', 'status', 'payment_method'], + buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000] + }); + + this.orderProcessingTime = new promClient.Histogram({ + name: 'business_order_processing_seconds', + help: 'Time to process orders end-to-end', + labelNames: ['order_type', 'fulfillment_type'], + buckets: [10, 30, 60, 180, 300, 600, 1800, 3600] + }); + + // API usage metrics + this.apiUsageByClient = new promClient.Counter({ + name: 'business_api_usage_by_client', + help: 'API usage segmented by client', + labelNames: ['client_id', 'tier', 'endpoint'] + }); + + this.apiQuotaRemaining = new promClient.Gauge({ + name: 'business_api_quota_remaining', + help: 'Remaining API quota for clients', + labelNames: ['client_id', 'tier', 'quota_type'] + }); + + // Revenue metrics + this.revenueByProduct = new promClient.Counter({ + name: 'business_revenue_by_product_cents', + help: 'Revenue by product in cents', + labelNames: ['product_id', 'product_category', 'currency'] + }); + + // Register business metrics + [ + this.activeUsers, this.userSignups, this.transactionAmount, + this.orderProcessingTime, this.apiUsageByClient, this.apiQuotaRemaining, + this.revenueByProduct + ].forEach(metric => this.businessRegister.registerMetric(metric)); + } + + initializeCustomCollectors() { + // SLI/SLO metrics + this.sloCompliance = new promClient.Gauge({ + name: 'slo_compliance_percentage', + help: 'SLO compliance percentage', + labelNames: ['slo_name', 'service', 'window'] + }); + + this.errorBudgetRemaining = new promClient.Gauge({ + name: 'error_budget_remaining_percentage', + help: 'Remaining error budget percentage', + labelNames: ['service', 'slo_type'] + }); + + this.register.registerMetric(this.sloCompliance); + this.register.registerMetric(this.errorBudgetRemaining); + } + + startPeriodicCollectors() { + // Update active users every 30 seconds + setInterval(() => { + const activeUserCount = this.calculateActiveUsers(); + this.activeUsers.set( + { user_type: 'registered', plan: 'free' }, + activeUserCount.free + ); + this.activeUsers.set( + { user_type: 'registered', plan: 'premium' }, + activeUserCount.premium + ); + }, 30000); + + // Update SLO compliance every minute + setInterval(() => { + this.updateSLOCompliance(); + 
}, 60000); + + // Database pool monitoring + setInterval(() => { + this.updateDatabasePoolMetrics(); + }, 15000); + } + + // Middleware for HTTP metrics + httpMetricsMiddleware() { + return (req, res, next) => { + const start = performance.now(); + const route = req.route?.path || req.path || 'unknown'; + + // Track in-flight requests + const inFlightGauge = new promClient.Gauge({ + name: 'http_requests_in_flight', + help: 'Number of in-flight HTTP requests', + labelNames: ['method', 'route'] + }); + + inFlightGauge.inc({ method: req.method, route }); + + res.on('finish', () => { + const duration = (performance.now() - start) / 1000; + const labels = { + method: req.method, + route, + status_code: res.statusCode, + service: process.env.SERVICE_NAME || 'api', + environment: process.env.NODE_ENV || 'development' + }; + + // Record metrics + this.httpRequestDuration.observe(labels, duration); + this.httpRequestTotal.inc(labels); + + if (res.statusCode >= 400) { + const errorType = res.statusCode >= 500 ? 'server_error' : 'client_error'; + this.httpRequestErrors.inc({ + ...labels, + error_type: errorType + }); + } + + inFlightGauge.dec({ method: req.method, route }); + + // Log slow requests + if (duration > 1) { + console.warn('Slow request detected:', { + ...labels, + duration, + user: req.user?.id, + ip: req.ip + }); + } + }); + + next(); + }; + } + + // Database query instrumentation + instrumentDatabase(knex) { + knex.on('query', (query) => { + query.__startTime = performance.now(); + }); + + knex.on('query-response', (response, query) => { + const duration = (performance.now() - query.__startTime) / 1000; + const table = this.extractTableName(query.sql); + + this.dbQueryDuration.observe({ + operation: query.method || 'select', + table, + database: process.env.DB_NAME || 'default', + status: 'success' + }, duration); + }); + + knex.on('query-error', (error, query) => { + const duration = (performance.now() - query.__startTime) / 1000; + const table = this.extractTableName(query.sql); + + this.dbQueryDuration.observe({ + operation: query.method || 'select', + table, + database: process.env.DB_NAME || 'default', + status: 'error' + }, duration); + }); + } + + // Cache instrumentation wrapper + wrapCache(cache) { + const wrapper = {}; + const methods = ['get', 'set', 'delete', 'has']; + + methods.forEach(method => { + wrapper[method] = async (...args) => { + const start = performance.now(); + const cacheName = cache.name || 'default'; + + try { + const result = await cache[method](...args); + const duration = (performance.now() - start) / 1000; + + // Record cache metrics + if (method === 'get') { + const status = result !== undefined ? 
'hit' : 'miss'; + this.cacheHitRate.inc({ + operation: method, + cache_name: cacheName, + status + }); + } else { + this.cacheHitRate.inc({ + operation: method, + cache_name: cacheName, + status: 'success' + }); + } + + this.cacheLatency.observe({ + operation: method, + cache_name: cacheName + }, duration); + + return result; + } catch (error) { + this.cacheHitRate.inc({ + operation: method, + cache_name: cacheName, + status: 'error' + }); + throw error; + } + }; + }); + + return wrapper; + } + + // External API call instrumentation + async trackExternalCall(serviceName, endpoint, callFunc) { + const start = performance.now(); + + try { + const result = await callFunc(); + const duration = (performance.now() - start) / 1000; + + this.externalApiCalls.observe({ + service: serviceName, + endpoint, + status_code: result.status || 200 + }, duration); + + return result; + } catch (error) { + const duration = (performance.now() - start) / 1000; + + this.externalApiCalls.observe({ + service: serviceName, + endpoint, + status_code: error.response?.status || 0 + }, duration); + + throw error; + } + } + + // Circuit breaker monitoring + updateCircuitBreakerState(service, state) { + const stateValue = { + 'closed': 0, + 'open': 1, + 'half-open': 2 + }[state] || 0; + + this.circuitBreakerState.set({ service }, stateValue); + } + + // Helper methods + calculateActiveUsers() { + // Implementation would query your session store or database + return { + free: Math.floor(Math.random() * 1000), + premium: Math.floor(Math.random() * 100) + }; + } + + updateSLOCompliance() { + // Calculate based on recent metrics + const availability = 99.95; // Calculate from actual metrics + const latencyP99 = 250; // Calculate from actual metrics + + this.sloCompliance.set({ + slo_name: 'availability', + service: 'api', + window: '30d' + }, availability); + + this.sloCompliance.set({ + slo_name: 'latency_p99', + service: 'api', + window: '30d' + }, latencyP99 < 500 ? 100 : 0); + + // Update error budget + const errorBudget = 100 - ((100 - availability) / 0.05) * 100; + this.errorBudgetRemaining.set({ + service: 'api', + slo_type: 'availability' + }, Math.max(0, errorBudget)); + } + + updateDatabasePoolMetrics() { + // Get pool stats from your database driver + const pool = global.dbPool; // Your database pool instance + if (pool) { + this.dbConnectionPool.set({ + state: 'active', + database: 'primary' + }, pool.numUsed()); + + this.dbConnectionPool.set({ + state: 'idle', + database: 'primary' + }, pool.numFree()); + + this.dbConnectionPool.set({ + state: 'total', + database: 'primary' + }, pool.numUsed() + pool.numFree()); + } + } + + extractTableName(sql) { + const match = sql.match(/(?:from|into|update)\s+`?(\w+)`?/i); + return match ? 
match[1] : 'unknown'; + } + + // Expose metrics endpoint + async getMetrics() { + const baseMetrics = await this.register.metrics(); + const businessMetrics = await this.businessRegister.metrics(); + return baseMetrics + '\n' + businessMetrics; + } +} + +// Express application setup +const express = require('express'); +const app = express(); +const metricsCollector = new MetricsCollector(); + +// Apply monitoring middleware +app.use(metricsCollector.httpMetricsMiddleware()); + +// Metrics endpoint +app.get('/metrics', async (req, res) => { + res.set('Content-Type', metricsCollector.register.contentType); + res.end(await metricsCollector.getMetrics()); +}); + +// Example API endpoint with comprehensive tracking +app.post('/api/orders', async (req, res) => { + const orderStart = performance.now(); + + try { + // Track business metrics + metricsCollector.transactionAmount.observe({ + type: 'purchase', + status: 'pending', + payment_method: req.body.paymentMethod + }, req.body.amount); + + // Simulate external payment API call + const paymentResult = await metricsCollector.trackExternalCall( + 'stripe', + '/charges', + async () => { + // Your actual payment API call + return await stripeClient.charges.create({ + amount: req.body.amount * 100, + currency: 'usd' + }); + } + ); + + // Track order processing time + const processingTime = (performance.now() - orderStart) / 1000; + metricsCollector.orderProcessingTime.observe({ + order_type: 'standard', + fulfillment_type: 'digital' + }, processingTime); + + // Track revenue + metricsCollector.revenueByProduct.inc({ + product_id: req.body.productId, + product_category: req.body.category, + currency: 'USD' + }, req.body.amount * 100); + + res.json({ success: true, orderId: paymentResult.id }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +module.exports = { app, metricsCollector }; +``` + +### Example 2: Complete Monitoring Stack with Docker Compose + +```yaml +# docker-compose.yml +version: '3.8' + +services: + prometheus: + image: prom/prometheus:v2.45.0 + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - ./alerting-rules.yml:/etc/prometheus/alerting-rules.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--storage.tsdb.retention.time=15d' + ports: + - "9090:9090" + networks: + - monitoring + + grafana: + image: grafana/grafana:10.0.0 + container_name: grafana + volumes: + - grafana-data:/var/lib/grafana + - ./grafana-dashboards:/etc/grafana/provisioning/dashboards + - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + - GF_SERVER_ROOT_URL=http://localhost:3000 + ports: + - "3000:3000" + networks: + - monitoring + depends_on: + - prometheus + + jaeger: + image: jaegertracing/all-in-one:1.47 + container_name: jaeger + environment: + - COLLECTOR_ZIPKIN_HOST_PORT=:9411 + - COLLECTOR_OTLP_ENABLED=true + ports: + - "5775:5775/udp" + - "6831:6831/udp" + - "6832:6832/udp" + - "5778:5778" + - "16686:16686" # Jaeger UI + - "14268:14268" + - "14250:14250" + - "9411:9411" + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + networks: + - monitoring + + alertmanager: + image: prom/alertmanager:v0.26.0 + container_name: alertmanager + volumes: + 
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + ports: + - "9093:9093" + networks: + - monitoring + +networks: + monitoring: + driver: bridge + +volumes: + prometheus-data: + grafana-data: +``` + +### Example 3: Advanced Grafana Dashboard Definitions + +```json +// grafana-dashboards/api-overview.json +{ + "dashboard": { + "id": null, + "uid": "api-overview", + "title": "API Performance Overview", + "tags": ["api", "performance", "sre"], + "timezone": "browser", + "schemaVersion": 16, + "version": 0, + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": { + "value": "Prometheus", + "text": "Prometheus" + } + }, + { + "name": "service", + "type": "query", + "datasource": "$datasource", + "query": "label_values(http_requests_total, service)", + "multi": true, + "includeAll": true, + "current": { + "value": ["$__all"], + "text": "All" + }, + "refresh": 1 + }, + { + "name": "environment", + "type": "query", + "datasource": "$datasource", + "query": "label_values(http_requests_total, environment)", + "current": { + "value": "production", + "text": "Production" + } + } + ] + }, + "panels": [ + { + "id": 1, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 }, + "type": "graph", + "title": "Request Rate (req/s)", + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=~\"$service\",environment=\"$environment\"}[5m])) by (service)", + "legendFormat": "{{service}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "reqps", + "label": "Requests per second" + } + ], + "lines": true, + "linewidth": 2, + "fill": 1, + "fillGradient": 3, + "steppedLine": false, + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "alert": { + "name": "High Request Rate", + "conditions": [ + { + "evaluator": { + "params": [10000], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["A", "5m", "now"] + }, + "reducer": { + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "1m", + "handler": 1, + "noDataState": "no_data", + "notifications": [ + { + "uid": "slack-channel" + } + ] + } + }, + { + "id": 2, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 }, + "type": "graph", + "title": "Error Rate (%)", + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=~\"$service\",environment=\"$environment\",status_code=~\"5..\"}[5m])) by (service) / sum(rate(http_requests_total{service=~\"$service\",environment=\"$environment\"}[5m])) by (service) * 100", + "legendFormat": "{{service}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "percent", + "label": "Error Rate", + "max": 10 + } + ], + "thresholds": [ + { + "value": 1, + "op": "gt", + "fill": true, + "line": true, + "colorMode": "critical" + } + ], + "alert": { + "name": "High Error Rate", + "conditions": [ + { + "evaluator": { + "params": [1], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["A", "5m", "now"] + }, + "reducer": { + "type": "last" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "1m", + "handler": 1, + "noDataState": "no_data", + "notifications": [ + { + "uid": "pagerduty" + } + ], + "message": "Error rate is above 1% for service {{service}}" + } + }, + { + "id": 3, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 }, + 
"type": "graph", + "title": "Response Time (p50, p95, p99)", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\",environment=\"$environment\"}[5m])) by (le, service))", + "legendFormat": "p50 {{service}}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\",environment=\"$environment\"}[5m])) by (le, service))", + "legendFormat": "p95 {{service}}", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\",environment=\"$environment\"}[5m])) by (le, service))", + "legendFormat": "p99 {{service}}", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Response Time" + } + ] + }, + { + "id": 4, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 8 }, + "type": "stat", + "title": "Current QPS", + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=~\"$service\",environment=\"$environment\"}[1m]))", + "instant": true, + "refId": "A" + } + ], + "format": "reqps", + "sparkline": { + "show": true, + "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(31, 120, 193, 0.18)" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "green" }, + { "value": 5000, "color": "yellow" }, + { "value": 10000, "color": "red" } + ] + } + }, + { + "id": 5, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 8 }, + "type": "stat", + "title": "Error Budget Remaining", + "targets": [ + { + "expr": "error_budget_remaining_percentage{service=~\"$service\",slo_type=\"availability\"}", + "instant": true, + "refId": "A" + } + ], + "format": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "red" }, + { "value": 25, "color": "orange" }, + { "value": 50, "color": "yellow" }, + { "value": 75, "color": "green" } + ] + } + }, + { + "id": 6, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 8 }, + "type": "table", + "title": "Top Slow Endpoints", + "targets": [ + { + "expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service=~\"$service\",environment=\"$environment\"}[5m])) by (le, route)))", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "type": "date" + }, + { + "alias": "Duration", + "colorMode": "cell", + "colors": ["green", "yellow", "red"], + "thresholds": [0.5, 1], + "type": "number", + "unit": "s" + } + ] + } + ] + } +} +``` + +### Example 4: Production-Ready Alerting Rules + +```yaml +# alerting-rules.yml +groups: + - name: api_alerts + interval: 30s + rules: + # SLO-based alerts + - alert: APIHighErrorRate + expr: | + ( + sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service, environment) + / + sum(rate(http_requests_total[5m])) by (service, environment) + ) > 0.01 + for: 5m + labels: + severity: critical + team: api-platform + annotations: + summary: "High error rate on {{ $labels.service }}" + description: "{{ $labels.service }} in {{ $labels.environment }} has error rate of {{ $value | humanizePercentage }} (threshold: 1%)" + runbook_url: "https://wiki.example.com/runbooks/api-high-error-rate" + dashboard_url: "https://grafana.example.com/d/api-overview?var-service={{ $labels.service }}" + + - alert: APIHighLatency + expr: | + histogram_quantile(0.95, + sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le) + ) > 0.5 + for: 10m + labels: + severity: warning + team: 
api-platform + annotations: + summary: "High latency on {{ $labels.service }}" + description: "P95 latency for {{ $labels.service }} is {{ $value | humanizeDuration }} (threshold: 500ms)" + + - alert: APILowAvailability + expr: | + up{job="api-services"} == 0 + for: 1m + labels: + severity: critical + team: api-platform + annotations: + summary: "API service {{ $labels.instance }} is down" + description: "{{ $labels.instance }} has been down for more than 1 minute" + + # Business metrics alerts + - alert: LowActiveUsers + expr: | + business_active_users{plan="premium"} < 10 + for: 30m + labels: + severity: warning + team: product + annotations: + summary: "Low number of active premium users" + description: "Only {{ $value }} premium users active in the last 30 minutes" + + - alert: HighTransactionFailureRate + expr: | + ( + sum(rate(business_transaction_amount_dollars_sum{status="failed"}[5m])) + / + sum(rate(business_transaction_amount_dollars_sum[5m])) + ) > 0.05 + for: 5m + labels: + severity: critical + team: payments + annotations: + summary: "High transaction failure rate" + description: "Transaction failure rate is {{ $value | humanizePercentage }} (threshold: 5%)" + + # Infrastructure alerts + - alert: DatabaseConnectionPoolExhausted + expr: | + ( + db_connection_pool_size{state="active"} + / + db_connection_pool_size{state="total"} + ) > 0.9 + for: 5m + labels: + severity: warning + team: database + annotations: + summary: "Database connection pool near exhaustion" + description: "{{ $labels.database }} pool is {{ $value | humanizePercentage }} utilized" + + - alert: CacheLowHitRate + expr: | + ( + sum(rate(cache_operations_total{status="hit"}[5m])) by (cache_name) + / + sum(rate(cache_operations_total{operation="get"}[5m])) by (cache_name) + ) < 0.8 + for: 15m + labels: + severity: warning + team: api-platform + annotations: + summary: "Low cache hit rate for {{ $labels.cache_name }}" + description: "Cache hit rate is {{ $value | humanizePercentage }} (expected: >80%)" + + - alert: CircuitBreakerOpen + expr: | + circuit_breaker_state == 1 + for: 1m + labels: + severity: warning + team: api-platform + annotations: + summary: "Circuit breaker open for {{ $labels.service }}" + description: "Circuit breaker for {{ $labels.service }} has been open for more than 1 minute" + + # SLO burn rate alerts (multi-window approach) + - alert: SLOBurnRateHigh + expr: | + ( + # 5m burn rate > 14.4 (1 hour of error budget in 5 minutes) + ( + sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service) + / + sum(rate(http_requests_total[5m])) by (service) + ) > (1 - 0.999) * 14.4 + ) and ( + # 1h burn rate > 1 (confirms it's not a spike) + ( + sum(rate(http_requests_total{status_code=~"5.."}[1h])) by (service) + / + sum(rate(http_requests_total[1h])) by (service) + ) > (1 - 0.999) + ) + labels: + severity: critical + team: api-platform + alert_type: slo_burn + annotations: + summary: "SLO burn rate critically high for {{ $labels.service }}" + description: "{{ $labels.service }} is burning error budget 14.4x faster than normal" + + # Resource alerts + - alert: HighMemoryUsage + expr: | + ( + container_memory_usage_bytes{container!="POD",container!=""} + / + container_spec_memory_limit_bytes{container!="POD",container!=""} + ) > 0.9 + for: 5m + labels: + severity: warning + team: api-platform + annotations: + summary: "High memory usage for {{ $labels.container }}" + description: "Container {{ $labels.container }} memory usage is {{ $value | humanizePercentage }}" + +# AlertManager 
configuration +# alertmanager.yml +global: + resolve_timeout: 5m + slack_api_url: 'YOUR_SLACK_WEBHOOK_URL' + +route: + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'default' + routes: + - match: + severity: critical + receiver: 'pagerduty-critical' + continue: true + - match: + severity: warning + receiver: 'slack-warnings' + - match: + team: payments + receiver: 'payments-team' + +receivers: + - name: 'default' + slack_configs: + - channel: '#alerts' + title: 'Alert: {{ .GroupLabels.alertname }}' + text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}' + + - name: 'pagerduty-critical' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY' + description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}' + details: + firing: '{{ .Alerts.Firing | len }}' + resolved: '{{ .Alerts.Resolved | len }}' + labels: '{{ .CommonLabels }}' + + - name: 'slack-warnings' + slack_configs: + - channel: '#warnings' + send_resolved: true + title: 'Warning: {{ .GroupLabels.alertname }}' + text: '{{ .CommonAnnotations.description }}' + actions: + - type: button + text: 'View Dashboard' + url: '{{ .CommonAnnotations.dashboard_url }}' + - type: button + text: 'View Runbook' + url: '{{ .CommonAnnotations.runbook_url }}' + + - name: 'payments-team' + email_configs: + - to: 'payments-team@example.com' + from: 'alerts@example.com' + headers: + Subject: 'Payment Alert: {{ .GroupLabels.alertname }}' + +inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'service'] +``` + +### Example 5: OpenTelemetry Integration for Distributed Tracing + +```javascript +// tracing/setup.js - OpenTelemetry configuration +const { NodeSDK } = require('@opentelemetry/sdk-node'); +const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node'); +const { Resource } = require('@opentelemetry/resources'); +const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions'); +const { JaegerExporter } = require('@opentelemetry/exporter-jaeger'); +const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus'); +const { + ConsoleSpanExporter, + BatchSpanProcessor, + SimpleSpanProcessor +} = require('@opentelemetry/sdk-trace-base'); +const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics'); + +class TracingSetup { + constructor(serviceName, environment = 'production') { + this.serviceName = serviceName; + this.environment = environment; + this.sdk = null; + } + + initialize() { + // Create resource identifying the service + const resource = Resource.default().merge( + new Resource({ + [SemanticResourceAttributes.SERVICE_NAME]: this.serviceName, + [SemanticResourceAttributes.SERVICE_VERSION]: process.env.VERSION || '1.0.0', + [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: this.environment, + 'service.namespace': 'api-platform', + 'service.instance.id': process.env.HOSTNAME || 'unknown', + 'telemetry.sdk.language': 'nodejs', + }) + ); + + // Configure Jaeger exporter for traces + const jaegerExporter = new JaegerExporter({ + endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces', + tags: { + service: this.serviceName, + environment: this.environment + } + }); + + // Configure Prometheus exporter for metrics + const prometheusExporter = new PrometheusExporter({ + port: 9464, + endpoint: '/metrics', + prefix: 'otel_', + appendTimestamp: true, + }, () => { + 
console.log('Prometheus metrics server started on port 9464'); + }); + + // Create SDK with auto-instrumentation + this.sdk = new NodeSDK({ + resource, + instrumentations: [ + getNodeAutoInstrumentations({ + '@opentelemetry/instrumentation-fs': { + enabled: false, // Disable fs to reduce noise + }, + '@opentelemetry/instrumentation-http': { + requestHook: (span, request) => { + span.setAttribute('http.request.body', JSON.stringify(request.body)); + span.setAttribute('http.request.user_id', request.user?.id); + }, + responseHook: (span, response) => { + span.setAttribute('http.response.size', response.length); + }, + ignoreIncomingPaths: ['/health', '/metrics', '/favicon.ico'], + ignoreOutgoingUrls: [(url) => url.includes('prometheus')] + }, + '@opentelemetry/instrumentation-express': { + requestHook: (span, request) => { + span.setAttribute('express.route', request.route?.path); + span.setAttribute('express.params', JSON.stringify(request.params)); + } + }, + '@opentelemetry/instrumentation-mysql2': { + enhancedDatabaseReporting: true, + }, + '@opentelemetry/instrumentation-redis-4': { + dbStatementSerializer: (cmdName, cmdArgs) => { + return `${cmdName} ${cmdArgs.slice(0, 2).join(' ')}`; + } + } + }) + ], + spanProcessor: new BatchSpanProcessor(jaegerExporter, { + maxQueueSize: 2048, + maxExportBatchSize: 512, + scheduledDelayMillis: 5000, + exportTimeoutMillis: 30000, + }), + metricReader: new PeriodicExportingMetricReader({ + exporter: prometheusExporter, + exportIntervalMillis: 10000, + }), + }); + + // Start the SDK + this.sdk.start() + .then(() => console.log('Tracing initialized successfully')) + .catch((error) => console.error('Error initializing tracing', error)); + + // Graceful shutdown + process.on('SIGTERM', () => { + this.shutdown(); + }); + } + + async shutdown() { + try { + await this.sdk.shutdown(); + console.log('Tracing terminated successfully'); + } catch (error) { + console.error('Error terminating tracing', error); + } + } + + // Manual span creation for custom instrumentation + createSpan(tracer, spanName, fn) { + return tracer.startActiveSpan(spanName, async (span) => { + try { + span.setAttribute('span.kind', 'internal'); + span.setAttribute('custom.span', true); + + const result = await fn(span); + + span.setStatus({ code: 0, message: 'OK' }); + return result; + } catch (error) { + span.setStatus({ code: 2, message: error.message }); + span.recordException(error); + throw error; + } finally { + span.end(); + } + }); + } +} + +// Usage in application +const tracing = new TracingSetup('api-gateway', process.env.NODE_ENV); +tracing.initialize(); + +// Custom instrumentation example +const { trace } = require('@opentelemetry/api'); + +async function processOrder(orderId) { + const tracer = trace.getTracer('order-processing', '1.0.0'); + + return tracing.createSpan(tracer, 'processOrder', async (span) => { + span.setAttribute('order.id', orderId); + span.addEvent('Order processing started'); + + // Validate order + await tracing.createSpan(tracer, 'validateOrder', async (childSpan) => { + childSpan.setAttribute('validation.type', 'schema'); + // Validation logic + await validateOrderSchema(orderId); + }); + + // Process payment + await tracing.createSpan(tracer, 'processPayment', async (childSpan) => { + childSpan.setAttribute('payment.method', 'stripe'); + // Payment logic + const result = await processStripePayment(orderId); + childSpan.setAttribute('payment.status', result.status); + childSpan.addEvent('Payment processed', { + 'payment.amount': result.amount, + 
'payment.currency': result.currency + }); + }); + + // Send confirmation + await tracing.createSpan(tracer, 'sendConfirmation', async (childSpan) => { + childSpan.setAttribute('notification.type', 'email'); + // Email logic + await sendOrderConfirmation(orderId); + }); + + span.addEvent('Order processing completed'); + return { success: true, orderId }; + }); +} + +module.exports = { TracingSetup, tracing }; +``` + +### Example 6: Custom Prometheus Exporters for Complex Metrics + +```python +# custom_exporters.py - Python Prometheus exporter for business metrics +from prometheus_client import start_http_server, Gauge, Counter, Histogram, Info, Enum +from prometheus_client.core import CollectorRegistry +from prometheus_client import generate_latest +import time +import psycopg2 +import redis +import requests +from datetime import datetime, timedelta +import asyncio +import aiohttp + +class CustomBusinessExporter: + def __init__(self, db_config, redis_config, port=9091): + self.registry = CollectorRegistry() + self.db_config = db_config + self.redis_config = redis_config + self.port = port + + # Initialize metrics + self.initialize_metrics() + + # Connect to data sources + self.connect_datasources() + + def initialize_metrics(self): + # Business KPI metrics + self.revenue_total = Gauge( + 'business_revenue_total_usd', + 'Total revenue in USD', + ['period', 'product_line', 'region'], + registry=self.registry + ) + + self.customer_lifetime_value = Histogram( + 'business_customer_lifetime_value_usd', + 'Customer lifetime value distribution', + ['customer_segment', 'acquisition_channel'], + buckets=(10, 50, 100, 500, 1000, 5000, 10000, 50000), + registry=self.registry + ) + + self.churn_rate = Gauge( + 'business_churn_rate_percentage', + 'Customer churn rate', + ['plan', 'cohort'], + registry=self.registry + ) + + self.monthly_recurring_revenue = Gauge( + 'business_mrr_usd', + 'Monthly recurring revenue', + ['plan', 'currency'], + registry=self.registry + ) + + self.net_promoter_score = Gauge( + 'business_nps', + 'Net Promoter Score', + ['segment', 'survey_type'], + registry=self.registry + ) + + # Operational metrics + self.data_pipeline_lag = Histogram( + 'data_pipeline_lag_seconds', + 'Data pipeline processing lag', + ['pipeline', 'stage'], + buckets=(1, 5, 10, 30, 60, 300, 600, 1800, 3600), + registry=self.registry + ) + + self.feature_usage = Counter( + 'feature_usage_total', + 'Feature usage counts', + ['feature_name', 'user_tier', 'success'], + registry=self.registry + ) + + self.api_quota_usage = Gauge( + 'api_quota_usage_percentage', + 'API quota usage by customer', + ['customer_id', 'tier', 'resource'], + registry=self.registry + ) + + # System health indicators + self.dependency_health = Enum( + 'dependency_health_status', + 'Health status of external dependencies', + ['service', 'dependency'], + states=['healthy', 'degraded', 'unhealthy'], + registry=self.registry + ) + + self.data_quality_score = Gauge( + 'data_quality_score', + 'Data quality score (0-100)', + ['dataset', 'dimension'], + registry=self.registry + ) + + def connect_datasources(self): + # PostgreSQL connection + self.db_conn = psycopg2.connect(**self.db_config) + + # Redis connection + self.redis_client = redis.Redis(**self.redis_config) + + def collect_business_metrics(self): + """Collect business metrics from various data sources""" + cursor = self.db_conn.cursor() + + # Revenue metrics + cursor.execute(""" + SELECT + DATE_TRUNC('day', created_at) as period, + product_line, + region, + SUM(amount) as total_revenue + 
FROM orders + WHERE status = 'completed' + AND created_at >= NOW() - INTERVAL '7 days' + GROUP BY period, product_line, region + """) + + for row in cursor.fetchall(): + self.revenue_total.labels( + period=row[0].isoformat(), + product_line=row[1], + region=row[2] + ).set(row[3]) + + # Customer lifetime value + cursor.execute(""" + SELECT + c.segment, + c.acquisition_channel, + AVG(o.total_spent) as avg_clv + FROM customers c + JOIN ( + SELECT customer_id, SUM(amount) as total_spent + FROM orders + WHERE status = 'completed' + GROUP BY customer_id + ) o ON c.id = o.customer_id + GROUP BY c.segment, c.acquisition_channel + """) + + for row in cursor.fetchall(): + self.customer_lifetime_value.labels( + customer_segment=row[0], + acquisition_channel=row[1] + ).observe(row[2]) + + # MRR calculation + cursor.execute(""" + SELECT + plan_name, + currency, + SUM( + CASE + WHEN billing_period = 'yearly' THEN amount / 12 + ELSE amount + END + ) as mrr + FROM subscriptions + WHERE status = 'active' + GROUP BY plan_name, currency + """) + + for row in cursor.fetchall(): + self.monthly_recurring_revenue.labels( + plan=row[0], + currency=row[1] + ).set(row[2]) + + # Churn rate + cursor.execute(""" + WITH cohort_data AS ( + SELECT + plan_name, + DATE_TRUNC('month', created_at) as cohort, + COUNT(*) as total_customers, + COUNT(CASE WHEN status = 'cancelled' THEN 1 END) as churned_customers + FROM subscriptions + WHERE created_at >= NOW() - INTERVAL '6 months' + GROUP BY plan_name, cohort + ) + SELECT + plan_name, + cohort, + (churned_customers::float / total_customers) * 100 as churn_rate + FROM cohort_data + """) + + for row in cursor.fetchall(): + self.churn_rate.labels( + plan=row[0], + cohort=row[1].isoformat() + ).set(row[2]) + + cursor.close() + + def collect_operational_metrics(self): + """Collect operational metrics from Redis and other sources""" + + # API quota usage from Redis + for key in self.redis_client.scan_iter("quota:*"): + parts = key.decode().split(':') + if len(parts) >= 3: + customer_id = parts[1] + resource = parts[2] + + used = float(self.redis_client.get(key) or 0) + limit_key = f"quota_limit:{customer_id}:{resource}" + limit = float(self.redis_client.get(limit_key) or 1000) + + usage_percentage = (used / limit) * 100 if limit > 0 else 0 + + # Get customer tier from database + cursor = self.db_conn.cursor() + cursor.execute( + "SELECT tier FROM customers WHERE id = %s", + (customer_id,) + ) + result = cursor.fetchone() + tier = result[0] if result else 'unknown' + cursor.close() + + self.api_quota_usage.labels( + customer_id=customer_id, + tier=tier, + resource=resource + ).set(usage_percentage) + + # Data pipeline lag from Redis + pipeline_stages = ['ingestion', 'processing', 'storage', 'delivery'] + for stage in pipeline_stages: + lag_key = f"pipeline:lag:{stage}" + lag_value = self.redis_client.get(lag_key) + if lag_value: + self.data_pipeline_lag.labels( + pipeline='main', + stage=stage + ).observe(float(lag_value)) + + def check_dependency_health(self): + """Check health of external dependencies""" + dependencies = [ + ('payment', 'stripe', 'https://api.stripe.com/health'), + ('email', 'sendgrid', 'https://api.sendgrid.com/health'), + ('storage', 's3', 'https://s3.amazonaws.com/health'), + ('cache', 'redis', 'redis://localhost:6379'), + ('database', 'postgres', self.db_config) + ] + + for service, dep_name, endpoint in dependencies: + try: + if dep_name == 'redis': + # Check Redis + self.redis_client.ping() + status = 'healthy' + elif dep_name == 'postgres': + # Check 
PostgreSQL + cursor = self.db_conn.cursor() + cursor.execute("SELECT 1") + cursor.close() + status = 'healthy' + else: + # Check HTTP endpoints + response = requests.get(endpoint, timeout=5) + if response.status_code == 200: + status = 'healthy' + elif 200 < response.status_code < 500: + status = 'degraded' + else: + status = 'unhealthy' + except Exception as e: + print(f"Health check failed for {dep_name}: {e}") + status = 'unhealthy' + + self.dependency_health.labels( + service=service, + dependency=dep_name + ).state(status) + + def calculate_data_quality(self): + """Calculate data quality scores""" + cursor = self.db_conn.cursor() + + # Completeness score + cursor.execute(""" + SELECT + 'orders' as dataset, + (COUNT(*) - COUNT(CASE WHEN customer_email IS NULL THEN 1 END))::float / COUNT(*) * 100 as completeness + FROM orders + WHERE created_at >= NOW() - INTERVAL '1 day' + """) + + for row in cursor.fetchall(): + self.data_quality_score.labels( + dataset=row[0], + dimension='completeness' + ).set(row[1]) + + # Accuracy score (checking for valid email formats) + cursor.execute(""" + SELECT + 'customers' as dataset, + COUNT(CASE WHEN email ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}$' THEN 1 END)::float / COUNT(*) * 100 as accuracy + FROM customers + WHERE created_at >= NOW() - INTERVAL '1 day' + """) + + for row in cursor.fetchall(): + self.data_quality_score.labels( + dataset=row[0], + dimension='accuracy' + ).set(row[1]) + + cursor.close() + + async def collect_metrics_async(self): + """Async collection for improved performance""" + tasks = [ + self.collect_business_metrics_async(), + self.collect_operational_metrics_async(), + self.check_dependency_health_async(), + self.calculate_data_quality_async() + ] + + await asyncio.gather(*tasks) + + def run(self): + """Start the exporter""" + # Start HTTP server for Prometheus to scrape + start_http_server(self.port, registry=self.registry) + print(f"Custom exporter started on port {self.port}") + + # Collect metrics every 30 seconds + while True: + try: + self.collect_business_metrics() + self.collect_operational_metrics() + self.check_dependency_health() + self.calculate_data_quality() + + print(f"Metrics collected at {datetime.now()}") + time.sleep(30) + + except Exception as e: + print(f"Error collecting metrics: {e}") + time.sleep(30) + +# Usage +if __name__ == "__main__": + db_config = { + 'host': 'localhost', + 'database': 'production', + 'user': 'metrics_user', + 'password': 'secure_password', + 'port': 5432 + } + + redis_config = { + 'host': 'localhost', + 'port': 6379, + 'db': 0, + 'decode_responses': True + } + + exporter = CustomBusinessExporter(db_config, redis_config) + exporter.run() +``` + +## Error Handling + +| Error | Cause | Solution | +|-------|-------|----------| +| "Connection refused to Prometheus" | Prometheus not running or wrong port | Check Docker container status with `docker ps`, verify port mapping | +| "No data in Grafana dashboard" | Metrics not being scraped | Verify Prometheus targets at `localhost:9090/targets`, check API metrics endpoint | +| "Too many samples" error | High cardinality labels | Review label usage, avoid user IDs or timestamps as labels | +| "Out of memory" in Prometheus | Retention too long or too many metrics | Reduce retention time, implement remote storage, or scale vertically | +| Jaeger traces not appearing | Incorrect sampling rate | Increase sampling rate in tracer configuration | +| "Context deadline exceeded" | Scrape timeout too short | Increase scrape_timeout in 
prometheus.yml (default 10s) | +| "Error reading Prometheus" | Corrupt WAL (write-ahead log) | Delete WAL directory: `rm -rf /prometheus/wal/*` and restart | +| "Too many open files" | File descriptor limit reached | Increase ulimit: `ulimit -n 65536` or adjust systemd limits | +| AlertManager not firing | Incorrect routing rules | Validate routing tree with `amtool config routes` | +| Grafana login loop | Cookie/session issues | Clear browser cookies, check Grafana cookie settings | + +## Configuration Options + +**Basic Usage:** +```bash +/create-monitoring \ + --stack=prometheus \ + --services=api-gateway,user-service,order-service \ + --environment=production \ + --retention=30d +``` + +**Available Options:** + +`--stack ` - Monitoring stack to deploy +- `prometheus` - Prometheus + Grafana + AlertManager (default, open-source) +- `elastic` - ELK stack (Elasticsearch, Logstash, Kibana) for log-centric +- `datadog` - Datadog agent configuration (requires API key) +- `newrelic` - New Relic agent setup (requires license key) +- `hybrid` - Combination of metrics (Prometheus) and logs (ELK) + +`--tracing ` - Distributed tracing backend +- `jaeger` - Jaeger all-in-one (default, recommended for start) +- `zipkin` - Zipkin server +- `tempo` - Grafana Tempo (for high-scale) +- `xray` - AWS X-Ray (for AWS environments) +- `none` - Skip tracing setup + +`--retention ` - Metrics retention period +- Default: `15d` (15 days) +- Production: `30d` to `90d` +- With remote storage: `365d` or more + +`--scrape-interval ` - How often to collect metrics +- Default: `15s` +- High-frequency: `5s` (higher resource usage) +- Low-frequency: `60s` (for stable metrics) + +`--alerting-channels ` - Where to send alerts +- `slack` - Slack webhook integration +- `pagerduty` - PagerDuty integration +- `email` - SMTP email notifications +- `webhook` - Custom webhook endpoint +- `opsgenie` - Atlassian OpsGenie + +`--dashboard-presets ` - Pre-built dashboards to install +- `red-metrics` - Rate, Errors, Duration +- `four-golden` - Latency, Traffic, Errors, Saturation +- `business-kpis` - Revenue, Users, Conversion +- `sre-slos` - SLI/SLO tracking +- `security` - Security metrics and anomalies + +`--exporters ` - Additional exporters to configure +- `node-exporter` - System/host metrics +- `blackbox-exporter` - Probe endpoints +- `postgres-exporter` - PostgreSQL metrics +- `redis-exporter` - Redis metrics +- `custom` - Custom business metrics + +`--high-availability` - Enable HA configuration +- Sets up Prometheus federation +- Configures AlertManager clustering +- Enables Grafana database replication + +`--storage ` - Long-term storage backend +- `local` - Local disk (default) +- `thanos` - Thanos for unlimited retention +- `cortex` - Cortex for multi-tenant +- `victoria` - VictoriaMetrics for efficiency +- `s3` - S3-compatible object storage + +`--dry-run` - Generate configuration without deploying +- Creates all config files +- Validates syntax +- Shows what would be deployed +- No actual containers started + +## Best Practices + +DO: +- Start with RED metrics (Rate, Errors, Duration) as your foundation +- Use histogram buckets that align with your SLO targets +- Tag metrics with environment, region, version, and service +- Create runbooks for every alert and link them in annotations +- Implement meta-monitoring (monitor the monitoring system) +- Use recording rules for frequently-run expensive queries +- Set up separate dashboards for different audiences (ops, dev, business) +- Use exemplars to link metrics to traces 
for easier debugging +- Implement gradual rollout of new metrics to avoid cardinality explosion +- Archive old dashboards before creating new ones + +DON'T: +- Add high-cardinality labels like user IDs, session IDs, or UUIDs +- Create dashboards with 50+ panels (causes browser performance issues) +- Alert on symptoms without providing actionable runbooks +- Store raw logs in Prometheus (use log aggregation systems) +- Ignore alert fatigue (regularly review and tune thresholds) +- Hardcode datasource UIDs in dashboard JSON +- Mix metrics from different time ranges in one panel +- Use regex selectors without limits in production queries +- Forget to set up backup for Grafana database +- Skip capacity planning for metrics growth + +TIPS: +- Import dashboards from grafana.com marketplace (dashboard IDs) +- Use Prometheus federation for multi-region deployments +- Implement progressive alerting: warning (Slack) → critical (PagerDuty) +- Create team-specific folders in Grafana for organization +- Use Grafana variables for dynamic, reusable dashboards +- Set up dashboard playlists for NOC/SOC displays +- Use annotations to mark deployments and incidents on graphs +- Implement SLO burn rate alerts instead of static thresholds +- Create separate Prometheus jobs for different scrape intervals +- Use remote_write for backup and long-term storage + +## Performance Considerations + +**Prometheus Resource Planning** +``` +Memory Required = + (number_of_time_series * 2KB) + # Active series + (ingestion_rate * 2 * retention_hours) + # WAL and blocks + (2GB) # Base overhead + +CPU Cores Required = + (ingestion_rate / 100,000) + # Ingestion processing + (query_rate / 10) + # Query processing + (1) # Base overhead + +Disk IOPS Required = + (ingestion_rate / 1000) + # Write IOPS + (query_rate * 100) + # Read IOPS + (100) # Background compaction +``` + +**Optimization Strategies** +1. **Reduce cardinality**: Audit and remove unnecessary labels +2. **Use recording rules**: Pre-compute expensive queries +3. **Optimize scrape configs**: Different intervals for different metrics +4. **Implement downsampling**: For long-term storage +5. **Horizontal sharding**: Separate Prometheus per service/team +6. **Remote storage**: Offload old data to object storage +7. **Query caching**: Use Trickster or built-in Grafana caching +8. **Metric relabeling**: Drop unwanted metrics at scrape time +9. **Federation**: Aggregate metrics hierarchically +10. 
**Capacity limits**: Set max_samples_per_send and queue sizes + +**Scaling Thresholds** +- < 1M active series: Single Prometheus instance +- 1M - 10M series: Prometheus with remote storage +- 10M - 100M series: Sharded Prometheus or Cortex +- > 100M series: Thanos or multi-region Cortex + +## Security Considerations + +**Authentication & Authorization** +```yaml +# prometheus.yml with basic auth +scrape_configs: + - job_name: 'secured-api' + basic_auth: + username: 'prometheus' + password_file: '/etc/prometheus/password.txt' + scheme: https + tls_config: + ca_file: '/etc/prometheus/ca.crt' + cert_file: '/etc/prometheus/cert.crt' + key_file: '/etc/prometheus/key.pem' + insecure_skip_verify: false +``` + +**Network Security** +- Deploy monitoring stack in isolated subnet +- Use internal load balancers for Prometheus federation +- Implement mTLS between Prometheus and targets +- Restrict metrics endpoints to monitoring CIDR blocks +- Use VPN or private links for cross-region federation + +**Data Security** +- Encrypt data at rest (filesystem encryption) +- Sanitize metrics to avoid leaking sensitive data +- Implement audit logging for all access +- Regular security scanning of monitoring infrastructure +- Rotate credentials and certificates regularly + +**Compliance Considerations** +- GDPR: Avoid collecting PII in metrics labels +- HIPAA: Encrypt all health-related metrics +- PCI DSS: Separate payment metrics into isolated stack +- SOC 2: Maintain audit trails and access logs + +## Troubleshooting Guide + +**Issue: Prometheus consuming too much memory** +```bash +# 1. Check current memory usage and series count +curl -s http://localhost:9090/api/v1/status/tsdb | jq '.data.seriesCountByMetricName' | head -20 + +# 2. Find high cardinality metrics +curl -g 'http://localhost:9090/api/v1/query?query=count(count+by(__name__)({__name__=~".+"}))' | jq + +# 3. Identify problematic labels +curl -s http://localhost:9090/api/v1/label/userId/values | jq '. | length' + +# 4. Drop high-cardinality metrics +# Add to prometheus.yml: +metric_relabel_configs: + - source_labels: [__name__] + regex: 'problematic_metric_.*' + action: drop +``` + +**Issue: Grafana dashboards loading slowly** +```bash +# 1. Check query performance +curl -s 'http://localhost:9090/api/v1/query_log' | jq '.data[] | select(.duration_seconds > 1)' + +# 2. Analyze slow queries in Grafana +SELECT + dashboard_id, + panel_id, + AVG(duration) as avg_duration, + query +FROM grafana.query_history +WHERE duration > 1000 +GROUP BY dashboard_id, panel_id, query +ORDER BY avg_duration DESC; + +# 3. Optimize with recording rules +# Add to recording_rules.yml: +groups: + - name: dashboard_queries + interval: 30s + rules: + - record: api:request_rate5m + expr: sum(rate(http_requests_total[5m])) by (service) +``` + +**Issue: Alerts not firing** +```bash +# 1. Check alert state +curl http://localhost:9090/api/v1/alerts | jq + +# 2. Validate AlertManager config +docker exec alertmanager amtool config routes + +# 3. Test alert routing +docker exec alertmanager amtool config routes test \ + --config.file=/etc/alertmanager/alertmanager.yml \ + --verify.receivers=slack-critical \ + severity=critical service=api + +# 4. Check for inhibition rules +curl http://localhost:9093/api/v1/alerts | jq '.[] | select(.status.inhibitedBy != [])' +``` + +**Issue: Missing traces in Jaeger** +```javascript +// 1. 
Verify sampling rate +const tracer = initTracer({ + serviceName: 'api-gateway', + sampler: { + type: 'const', // Change to 'const' for debugging + param: 1, // 1 = sample everything + }, +}); + +// 2. Check span reporting +tracer.on('span_finished', (span) => { + console.log('Span finished:', span.operationName(), span.context().toTraceId()); +}); + +// 3. Verify Jaeger agent connectivity +curl http://localhost:14268/api/traces?service=api-gateway +``` + +## Migration Guide + +**From CloudWatch to Prometheus:** +```python +# Migration script example +import boto3 +from prometheus_client import CollectorRegistry, Gauge, push_to_gateway + +def migrate_cloudwatch_to_prometheus(): + # Read from CloudWatch + cw = boto3.client('cloudwatch') + metrics = cw.get_metric_statistics( + Namespace='AWS/EC2', + MetricName='CPUUtilization', + StartTime=datetime.now() - timedelta(hours=1), + EndTime=datetime.now(), + Period=300, + Statistics=['Average'] + ) + + # Write to Prometheus + registry = CollectorRegistry() + g = Gauge('aws_ec2_cpu_utilization', 'EC2 CPU Usage', + ['instance_id'], registry=registry) + + for datapoint in metrics['Datapoints']: + g.labels(instance_id='i-1234567890abcdef0').set(datapoint['Average']) + push_to_gateway('localhost:9091', job='cloudwatch_migration', registry=registry) +``` + +**From Datadog to Prometheus:** +1. Export Datadog dashboards as JSON +2. Convert queries using query translator +3. Import to Grafana with dashboard converter +4. Map Datadog tags to Prometheus labels +5. Recreate alerts in AlertManager format + +## Related Commands + +- `/api-load-tester` - Generate test traffic to validate monitoring setup +- `/api-security-scanner` - Security testing with metrics integration +- `/add-rate-limiting` - Rate limiting with metrics exposure +- `/api-contract-generator` - Generate OpenAPI specs with metrics annotations +- `/deployment-pipeline-orchestrator` - CI/CD with monitoring integration +- `/api-versioning-manager` - Version-aware metrics tracking + +## Advanced Topics + +**Multi-Cluster Monitoring with Thanos:** +```yaml +# thanos-sidecar.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: thanos-config +data: + object-store.yaml: | + type: S3 + config: + bucket: metrics-long-term + endpoint: s3.amazonaws.com + access_key: ${AWS_ACCESS_KEY} + secret_key: ${AWS_SECRET_KEY} +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus-thanos +spec: + template: + spec: + containers: + - name: prometheus + args: + - --storage.tsdb.retention.time=2h + - --storage.tsdb.min-block-duration=2h + - --storage.tsdb.max-block-duration=2h + - --web.enable-lifecycle + - name: thanos-sidecar + image: quay.io/thanos/thanos:v0.31.0 + args: + - sidecar + - --prometheus.url=http://localhost:9090 + - --objstore.config-file=/etc/thanos/object-store.yaml +``` + +**Service Mesh Observability (Istio):** +```yaml +# Automatic metrics from Istio +telemetry: + v2: + prometheus: + providers: + - name: prometheus + configOverride: + inboundSidecar: + disable_host_header_fallback: false + metric_expiry_duration: 10m + outboundSidecar: + disable_host_header_fallback: false + metric_expiry_duration: 10m + gateway: + disable_host_header_fallback: true +``` + +## Version History + +- v1.0.0 (2024-01): Initial Prometheus + Grafana implementation +- v1.1.0 (2024-03): Added Jaeger tracing integration +- v1.2.0 (2024-05): Thanos long-term storage support +- v1.3.0 (2024-07): OpenTelemetry collector integration +- v1.4.0 (2024-09): Multi-cluster federation support +- v1.5.0 
(2024-10): Custom business metrics exporters +- Planned v2.0.0: eBPF-based zero-instrumentation monitoring diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..c500b0b --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,97 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:jeremylongshore/claude-code-plugins-plus:plugins/api-development/api-monitoring-dashboard", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "7a5e6f37ed72f980f5d05ad2a1cc943611588fa0", + "treeHash": "44aa3050002c5dd367a146cbb791c252fd7e11bbf902b9d96b3c84b894627c0e", + "generatedAt": "2025-11-28T10:18:07.535834Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "api-monitoring-dashboard", + "description": "Create monitoring dashboards for API health, metrics, and alerts", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "c4a515bfab31dcc060b77a78a6714e6ba1c3bd0d7d8cb7ec56c0ccd2905594e8" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "9955e8e9340d27f0269091ed561c38b8fbb4efef89f93fe3e3511e9e3db1a1ce" + }, + { + "path": "commands/create-monitoring.md", + "sha256": "fe559bd3ab97abb5fc72361e2a69729e2b4043a2ce5a4c3d02ee1e040ec722b0" + }, + { + "path": "skills/skill-adapter/references/examples.md", + "sha256": "922bbc3c4ebf38b76f515b5c1998ebde6bf902233e00e2c5a0e9176f975a7572" + }, + { + "path": "skills/skill-adapter/references/best-practices.md", + "sha256": "c8f32b3566252f50daacd346d7045a1060c718ef5cfb07c55a0f2dec5f1fb39e" + }, + { + "path": "skills/skill-adapter/references/README.md", + "sha256": "aa6f26af1ed7acd182c9c7d49628e3d42094c9df8894341099096374310f62fb" + }, + { + "path": "skills/skill-adapter/scripts/helper-template.sh", + "sha256": "0881d5660a8a7045550d09ae0acc15642c24b70de6f08808120f47f86ccdf077" + }, + { + "path": "skills/skill-adapter/scripts/validation.sh", + "sha256": "92551a29a7f512d2036e4f1fb46c2a3dc6bff0f7dde4a9f699533e446db48502" + }, + { + "path": "skills/skill-adapter/scripts/README.md", + "sha256": "aac6edc951b6524a56cd2a0b69cf145106117f776e1935c8ab90b828e16cd83f" + }, + { + "path": "skills/skill-adapter/assets/dashboard_template.json", + "sha256": "a3597b47f572a501ab3639a9b798217b0a5fd5c2cdba07691e9664851fcbe09f" + }, + { + "path": "skills/skill-adapter/assets/test-data.json", + "sha256": "ac17dca3d6e253a5f39f2a2f1b388e5146043756b05d9ce7ac53a0042eee139d" + }, + { + "path": "skills/skill-adapter/assets/example_dashboard_config.yaml", + "sha256": "c9a5f99da14bc7e2e1a9e8fdd2c21255533e80629a8894a4831fc7f89ea18244" + }, + { + "path": "skills/skill-adapter/assets/visualization_examples.md", + "sha256": "84e9b3318d54f99e0be8e8fba2a680e3d47bde29e8de15e3cc26800231b5c7ea" + }, + { + "path": "skills/skill-adapter/assets/README.md", + "sha256": "a3934d5e3b4e7fcf187e20809e0e39b5a9aa9b6d78489897b8e88c676dca66ba" + }, + { + "path": "skills/skill-adapter/assets/skill-schema.json", + "sha256": "f5639ba823a24c9ac4fb21444c0717b7aefde1a4993682897f5bf544f863c2cd" + }, + { + "path": "skills/skill-adapter/assets/config-template.json", + "sha256": "0c2ba33d2d3c5ccb266c0848fc43caa68a2aa6a80ff315d4b378352711f83e1c" + } + ], + "dirSha256": "44aa3050002c5dd367a146cbb791c252fd7e11bbf902b9d96b3c84b894627c0e" + }, + "security": { + "scannedAt": 
null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/skill-adapter/assets/README.md b/skills/skill-adapter/assets/README.md new file mode 100644 index 0000000..2005f64 --- /dev/null +++ b/skills/skill-adapter/assets/README.md @@ -0,0 +1,7 @@ +# Assets + +Bundled resources for api-monitoring-dashboard skill + +- [ ] dashboard_template.json: JSON template for creating a basic API monitoring dashboard. +- [ ] example_dashboard_config.yaml: Example configuration file for defining API endpoints, metrics, and alerting rules. +- [ ] visualization_examples.md: Examples of different visualizations (e.g., line charts, bar graphs) for displaying API metrics. diff --git a/skills/skill-adapter/assets/config-template.json b/skills/skill-adapter/assets/config-template.json new file mode 100644 index 0000000..16f1712 --- /dev/null +++ b/skills/skill-adapter/assets/config-template.json @@ -0,0 +1,32 @@ +{ + "skill": { + "name": "skill-name", + "version": "1.0.0", + "enabled": true, + "settings": { + "verbose": false, + "autoActivate": true, + "toolRestrictions": true + } + }, + "triggers": { + "keywords": [ + "example-trigger-1", + "example-trigger-2" + ], + "patterns": [] + }, + "tools": { + "allowed": [ + "Read", + "Grep", + "Bash" + ], + "restricted": [] + }, + "metadata": { + "author": "Plugin Author", + "category": "general", + "tags": [] + } +} diff --git a/skills/skill-adapter/assets/dashboard_template.json b/skills/skill-adapter/assets/dashboard_template.json new file mode 100644 index 0000000..53b1927 --- /dev/null +++ b/skills/skill-adapter/assets/dashboard_template.json @@ -0,0 +1,90 @@ +{ + "_comment": "Template for creating an API monitoring dashboard. This JSON defines the basic structure and sample data for visualizing API health, metrics, and alerts.", + "dashboard_name": "API Performance Dashboard", + "description": "A comprehensive dashboard for monitoring the health and performance of your APIs.", + "data_source": "Prometheus", + "refresh_interval": "5m", + "panels": [ + { + "panel_id": 1, + "title": "API Request Rate", + "type": "timeseries", + "_comment": "Visualizes the number of API requests over time.", + "query": "rate(http_requests_total[5m])", + "legend": "{{method}} {{path}}", + "unit": "req/s", + "axis_format": "short" + }, + { + "panel_id": 2, + "title": "API Error Rate (5xx)", + "type": "timeseries", + "_comment": "Displays the error rate of API requests resulting in 5xx errors.", + "query": "rate(http_requests_total{status=~'5.*'}[5m])", + "legend": "{{method}} {{path}}", + "unit": "%", + "axis_format": "percent", + "transform": "multiply_by_100" + }, + { + "panel_id": 3, + "title": "Average API Response Time", + "type": "timeseries", + "_comment": "Tracks the average response time of API requests.", + "query": "avg(http_request_duration_seconds_sum) / avg(http_request_duration_seconds_count)", + "legend": "{{method}} {{path}}", + "unit": "ms", + "axis_format": "short", + "transform": "multiply_by_1000" + }, + { + "panel_id": 4, + "title": "API Latency (P95)", + "type": "timeseries", + "_comment": "Shows the 95th percentile latency of API requests.", + "query": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legend": "{{method}} {{path}}", + "unit": "ms", + "axis_format": "short", + "transform": "multiply_by_1000" + }, + { + "panel_id": 5, + "title": "API Status Codes", + "type": "stat", + "_comment": "Displays the distribution of API status codes.", + "query": 
"sum(http_requests_total) by (status)", + "unit": "total", + "color_thresholds": [ + { "value": 0, "color": "green" }, + { "value": 1000, "color": "yellow" }, + { "value": 5000, "color": "red" } + ] + }, + { + "panel_id": 6, + "title": "Alerts", + "type": "table", + "_comment": "Displays active alerts related to API performance.", + "query": "ALERTS{}", + "columns": ["alertname", "severity", "description", "value"] + } + ], + "variables": [ + { + "name": "namespace", + "label": "Namespace", + "query": "label_values(namespace)", + "multi": true, + "includeAll": true + }, + { + "name": "service", + "label": "Service", + "query": "label_values(service, namespace='$namespace')", + "multi": true, + "includeAll": true + } + ], + "tags": ["api", "monitoring", "performance", "health"] +} \ No newline at end of file diff --git a/skills/skill-adapter/assets/example_dashboard_config.yaml b/skills/skill-adapter/assets/example_dashboard_config.yaml new file mode 100644 index 0000000..5730a6c --- /dev/null +++ b/skills/skill-adapter/assets/example_dashboard_config.yaml @@ -0,0 +1,113 @@ +# Configuration file for API Monitoring Dashboard Plugin + +# API Endpoints to Monitor +api_endpoints: + # Each entry defines an API endpoint to be monitored. + - name: "User Service API" # Descriptive name for the API + url: "https://api.example.com/users" # The actual API endpoint URL + method: "GET" # HTTP method (GET, POST, PUT, DELETE, etc.) + expected_status_code: 200 # Expected HTTP status code for a successful response + timeout: 5 # Timeout in seconds for the API request + headers: # Optional headers to include in the API request + Content-Type: "application/json" + Authorization: "Bearer REPLACE_ME" + - name: "Product Service API" + url: "https://api.example.com/products" + method: "GET" + expected_status_code: 200 + timeout: 5 + - name: "Order Service API (POST)" + url: "https://api.example.com/orders" + method: "POST" + expected_status_code: 201 + timeout: 10 + data: # Example data for POST requests (can be a placeholder) + item_id: 123 + quantity: 2 + - name: "Authentication API" + url: "https://auth.example.com/login" + method: "POST" + expected_status_code: 200 + timeout: 5 + data: + username: "YOUR_USERNAME" + password: "YOUR_PASSWORD" + +# Metrics to Collect and Display +metrics: + # Each entry defines a metric to be collected from the API response. + - name: "Response Time (ms)" # Descriptive name for the metric + endpoint: "User Service API" # The API endpoint to collect the metric from (must match an entry in api_endpoints) + json_path: "response_time" # JSON path to extract the metric value from the response (if applicable) + unit: "ms" # Unit of measurement for the metric + type: "number" # Data type of the metric (number, string, boolean) + - name: "Data Size (KB)" + endpoint: "Product Service API" + json_path: "data_size" + unit: "KB" + type: "number" + - name: "Error Count" + endpoint: "Order Service API (POST)" + json_path: "error_count" + unit: "count" + type: "number" + - name: "Login Success Rate" + endpoint: "Authentication API" + json_path: "success_rate" + unit: "%" + type: "number" + +# Alerting Rules +alerts: + # Each entry defines an alerting rule. 
+ - name: "High Response Time" # Descriptive name for the alert + metric: "Response Time (ms)" # The metric to monitor (must match an entry in metrics) + threshold: 200 # Threshold value for the alert + operator: ">" # Operator to compare the metric value with the threshold (>, <, >=, <=, ==, !=) + severity: "critical" # Severity of the alert (critical, warning, info) + notification_channels: # List of notification channels to send the alert to + - "email" + - "slack" + - name: "Low Data Size" + metric: "Data Size (KB)" + threshold: 10 + operator: "<" + severity: "warning" + notification_channels: + - "email" + - name: "High Error Count" + metric: "Error Count" + threshold: 5 + operator: ">=" + severity: "critical" + notification_channels: + - "slack" + - name: "Low Login Success Rate" + metric: "Login Success Rate" + threshold: 90 + operator: "<" + severity: "warning" + notification_channels: + - "email" + +# Notification Channel Configurations (REPLACE_ME) +notification_channels_config: + email: + smtp_server: "smtp.example.com" + smtp_port: 587 + sender_email: "monitoring@example.com" + recipient_email: "alerts@example.com" + smtp_username: "YOUR_SMTP_USERNAME" + smtp_password: "YOUR_SMTP_PASSWORD" + slack: + slack_webhook_url: "YOUR_SLACK_WEBHOOK_URL" + +# Dashboard Configuration +dashboard: + title: "API Monitoring Dashboard" + refresh_interval: 60 # Refresh interval in seconds + layout: # Define the layout of the dashboard (example only) + - "User Service API": ["Response Time (ms)"] + - "Product Service API": ["Data Size (KB)"] + - "Order Service API (POST)": ["Error Count"] + - "Authentication API": ["Login Success Rate"] \ No newline at end of file diff --git a/skills/skill-adapter/assets/skill-schema.json b/skills/skill-adapter/assets/skill-schema.json new file mode 100644 index 0000000..8dc154c --- /dev/null +++ b/skills/skill-adapter/assets/skill-schema.json @@ -0,0 +1,28 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Claude Skill Configuration", + "type": "object", + "required": ["name", "description"], + "properties": { + "name": { + "type": "string", + "pattern": "^[a-z0-9-]+$", + "maxLength": 64, + "description": "Skill identifier (lowercase, hyphens only)" + }, + "description": { + "type": "string", + "maxLength": 1024, + "description": "What the skill does and when to use it" + }, + "allowed-tools": { + "type": "string", + "description": "Comma-separated list of allowed tools" + }, + "version": { + "type": "string", + "pattern": "^\\d+\\.\\d+\\.\\d+$", + "description": "Semantic version (x.y.z)" + } + } +} diff --git a/skills/skill-adapter/assets/test-data.json b/skills/skill-adapter/assets/test-data.json new file mode 100644 index 0000000..f0cd871 --- /dev/null +++ b/skills/skill-adapter/assets/test-data.json @@ -0,0 +1,27 @@ +{ + "testCases": [ + { + "name": "Basic activation test", + "input": "trigger phrase example", + "expected": { + "activated": true, + "toolsUsed": ["Read", "Grep"], + "success": true + } + }, + { + "name": "Complex workflow test", + "input": "multi-step trigger example", + "expected": { + "activated": true, + "steps": 3, + "toolsUsed": ["Read", "Write", "Bash"], + "success": true + } + } + ], + "fixtures": { + "sampleInput": "example data", + "expectedOutput": "processed result" + } +} diff --git a/skills/skill-adapter/assets/visualization_examples.md b/skills/skill-adapter/assets/visualization_examples.md new file mode 100644 index 0000000..f335488 --- /dev/null +++ b/skills/skill-adapter/assets/visualization_examples.md 
@@ -0,0 +1,122 @@ +# API Monitoring Dashboard: Visualization Examples + +This document provides examples of different visualizations you can use in your API monitoring dashboards, created with the `api-monitoring-dashboard` plugin. Use these examples as inspiration and adapt them to your specific API and monitoring needs. + +## 1. Line Charts: Time-Series Data + +Line charts are excellent for visualizing trends over time. They are particularly useful for showing API response times, request rates, and error rates. + +**Example:** API Response Time over the Past 24 Hours + +* **Metric:** Average API Response Time (milliseconds) +* **Time Range:** Past 24 hours +* **Granularity:** 1 hour +* **Visualization:** Line Chart +* **Data Source:** [Placeholder: Your API Monitoring Data Source (e.g., Prometheus, Datadog, New Relic)] + +**Instructions:** + +1. Configure your data source to collect API response time data. +2. Specify the time range and granularity for the chart. Shorter granularities (e.g., 5 minutes) are useful for identifying short-term spikes, while longer granularities (e.g., 1 hour) are better for identifying long-term trends. +3. Ensure your data source returns data in a format compatible with the charting library used by the `api-monitoring-dashboard` plugin. + +**Placeholder for Chart Image (Optional):** + +[Insert Image of API Response Time Line Chart Here] + +## 2. Bar Graphs: Categorical Data + +Bar graphs are useful for comparing different categories of data, such as API endpoints, HTTP status codes, or geographic regions. + +**Example:** API Request Count by Endpoint + +* **Metric:** Number of API Requests +* **Category:** API Endpoint (e.g., `/users`, `/products`, `/orders`) +* **Time Range:** Past 7 days +* **Visualization:** Bar Graph +* **Data Source:** [Placeholder: Your API Monitoring Data Source] + +**Instructions:** + +1. Configure your data source to track API requests by endpoint. +2. Specify the time range for the chart. +3. Consider using different colors to represent different API endpoints. + +**Placeholder for Chart Image (Optional):** + +[Insert Image of API Request Count Bar Graph Here] + +## 3. Gauge Charts: Single Value Performance + +Gauge charts are effective for displaying a single, critical performance metric and its current status relative to a threshold. + +**Example:** CPU Utilization of API Server + +* **Metric:** CPU Utilization (%) +* **Threshold:** 80% (Warning), 95% (Critical) +* **Visualization:** Gauge Chart +* **Data Source:** [Placeholder: Your Server Monitoring Data Source] + +**Instructions:** + +1. Configure your server monitoring data source to collect CPU utilization data. +2. Define appropriate thresholds for warning and critical levels. These thresholds should be based on your API's performance requirements and resource constraints. +3. The gauge chart should visually indicate when the metric exceeds the warning or critical thresholds. + +**Placeholder for Chart Image (Optional):** + +[Insert Image of CPU Utilization Gauge Chart Here] + +## 4. Heatmaps: Correlation and Density + +Heatmaps are useful for visualizing correlations between different metrics or the density of events over time. + +**Example:** Latency Distribution by API Endpoint and Time of Day + +* **Metric:** API Latency (milliseconds) +* **X-Axis:** API Endpoint +* **Y-Axis:** Time of Day +* **Visualization:** Heatmap +* **Data Source:** [Placeholder: Your API Monitoring Data Source] + +**Instructions:** + +1. 
Configure your data source to track API latency by endpoint and time of day. +2. Choose a color palette that effectively represents the range of latency values. +3. Consider using a logarithmic scale for the latency values to better visualize variations in the data. + +**Placeholder for Chart Image (Optional):** + +[Insert Image of Latency Distribution Heatmap Here] + +## 5. Tables: Detailed Data + +Tables are useful for displaying detailed data and allowing users to sort and filter the data. + +**Example:** Recent API Errors + +* **Columns:** Timestamp, API Endpoint, HTTP Status Code, Error Message, Client IP Address +* **Data Source:** [Placeholder: Your API Error Logs] +* **Visualization:** Table + +**Instructions:** + +1. Configure your data source to collect detailed API error logs. +2. Include relevant columns in the table, such as timestamp, API endpoint, HTTP status code, error message, and client IP address. +3. Allow users to sort and filter the data by different columns. + +**Placeholder for Table Data (Example):** + +| Timestamp | API Endpoint | HTTP Status Code | Error Message | Client IP Address | +|---|---|---|---|---| +| 2023-10-27 10:00:00 | /users | 500 | Internal Server Error | 192.168.1.100 | +| 2023-10-27 10:01:00 | /products | 404 | Not Found | 192.168.1.101 | +| 2023-10-27 10:02:00 | /orders | 503 | Service Unavailable | 192.168.1.102 | + +## Important Considerations + +* **Data Source Integration:** Ensure the `api-monitoring-dashboard` plugin can seamlessly integrate with your existing monitoring data sources. Provide clear instructions on how to configure these integrations. +* **Customization:** Allow users to customize the appearance and behavior of the visualizations, such as color palettes, axis labels, and threshold values. +* **Alerting:** Integrate alerts with the visualizations to notify users when critical performance metrics exceed predefined thresholds. +* **Accessibility:** Ensure the visualizations are accessible to users with disabilities, following WCAG guidelines. +* **Performance:** Optimize the visualizations for performance, especially when dealing with large datasets. \ No newline at end of file diff --git a/skills/skill-adapter/references/README.md b/skills/skill-adapter/references/README.md new file mode 100644 index 0000000..4c15116 --- /dev/null +++ b/skills/skill-adapter/references/README.md @@ -0,0 +1,7 @@ +# References + +Bundled resources for api-monitoring-dashboard skill + +- [ ] api_metrics_reference.md: Documentation on available API metrics, their definitions, and how to interpret them. +- [ ] dashboard_templates.md: Pre-defined dashboard templates for different API types (e.g., REST, GraphQL) and use cases (e.g., performance monitoring, error tracking). +- [ ] alerting_strategies.md: Best practices for setting up alerts and notifications based on API metrics and thresholds. diff --git a/skills/skill-adapter/references/best-practices.md b/skills/skill-adapter/references/best-practices.md new file mode 100644 index 0000000..3505048 --- /dev/null +++ b/skills/skill-adapter/references/best-practices.md @@ -0,0 +1,69 @@ +# Skill Best Practices + +Guidelines for optimal skill usage and development. + +## For Users + +### Activation Best Practices + +1. **Use Clear Trigger Phrases** + - Match phrases from skill description + - Be specific about intent + - Provide necessary context + +2. **Provide Sufficient Context** + - Include relevant file paths + - Specify scope of analysis + - Mention any constraints + +3. 
**Understand Tool Permissions** + - Check allowed-tools in frontmatter + - Know what the skill can/cannot do + - Request appropriate actions + +### Workflow Optimization + +- Start with simple requests +- Build up to complex workflows +- Verify each step before proceeding +- Use skill consistently for related tasks + +## For Developers + +### Skill Development Guidelines + +1. **Clear Descriptions** + - Include explicit trigger phrases + - Document all capabilities + - Specify limitations + +2. **Proper Tool Permissions** + - Use minimal necessary tools + - Document security implications + - Test with restricted tools + +3. **Comprehensive Documentation** + - Provide usage examples + - Document common pitfalls + - Include troubleshooting guide + +### Maintenance + +- Keep version updated +- Test after tool updates +- Monitor user feedback +- Iterate on descriptions + +## Performance Tips + +- Scope skills to specific domains +- Avoid overlapping trigger phrases +- Keep descriptions under 1024 chars +- Test activation reliability + +## Security Considerations + +- Never include secrets in skill files +- Validate all inputs +- Use read-only tools when possible +- Document security requirements diff --git a/skills/skill-adapter/references/examples.md b/skills/skill-adapter/references/examples.md new file mode 100644 index 0000000..b1d8bd2 --- /dev/null +++ b/skills/skill-adapter/references/examples.md @@ -0,0 +1,70 @@ +# Skill Usage Examples + +This document provides practical examples of how to use this skill effectively. + +## Basic Usage + +### Example 1: Simple Activation + +**User Request:** +``` +[Describe trigger phrase here] +``` + +**Skill Response:** +1. Analyzes the request +2. Performs the required action +3. Returns results + +### Example 2: Complex Workflow + +**User Request:** +``` +[Describe complex scenario] +``` + +**Workflow:** +1. Step 1: Initial analysis +2. Step 2: Data processing +3. Step 3: Result generation +4. Step 4: Validation + +## Advanced Patterns + +### Pattern 1: Chaining Operations + +Combine this skill with other tools: +``` +Step 1: Use this skill for [purpose] +Step 2: Chain with [other tool] +Step 3: Finalize with [action] +``` + +### Pattern 2: Error Handling + +If issues occur: +- Check trigger phrase matches +- Verify context is available +- Review allowed-tools permissions + +## Tips & Best Practices + +- ✅ Be specific with trigger phrases +- ✅ Provide necessary context +- ✅ Check tool permissions match needs +- ❌ Avoid vague requests +- ❌ Don't mix unrelated tasks + +## Common Issues + +**Issue:** Skill doesn't activate +**Solution:** Use exact trigger phrases from description + +**Issue:** Unexpected results +**Solution:** Check input format and context + +## See Also + +- Main SKILL.md for full documentation +- scripts/ for automation helpers +- assets/ for configuration examples diff --git a/skills/skill-adapter/scripts/README.md b/skills/skill-adapter/scripts/README.md new file mode 100644 index 0000000..63bbb5b --- /dev/null +++ b/skills/skill-adapter/scripts/README.md @@ -0,0 +1,7 @@ +# Scripts + +Bundled resources for api-monitoring-dashboard skill + +- [ ] create_dashboard.py: Automates the creation of API monitoring dashboards based on user-defined metrics and thresholds. +- [ ] update_dashboard.py: Updates existing API monitoring dashboards with new metrics, alerts, or visualizations. +- [ ] get_api_metrics.py: Fetches API metrics from various sources (e.g., Prometheus, Datadog) based on API endpoints and time ranges. 
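+
+As a rough sketch of what `get_api_metrics.py` could look like when backed by a
+Prometheus data source (the server URL, label names, and PromQL query below are
+placeholders, not values shipped with this plugin):
+
+```python
+#!/usr/bin/env python3
+"""Sketch of get_api_metrics.py: fetch metrics for an API endpoint from Prometheus."""
+import time
+
+import requests  # assumed dependency; any HTTP client would do
+
+PROMETHEUS_URL = "http://localhost:9090"  # placeholder: your Prometheus server
+
+
+def get_api_metrics(promql: str, hours: float = 1, step: str = "60s") -> list:
+    """Run a range query over the last `hours` hours and return the result series."""
+    end = time.time()
+    start = end - hours * 3600
+    resp = requests.get(
+        f"{PROMETHEUS_URL}/api/v1/query_range",
+        params={"query": promql, "start": start, "end": end, "step": step},
+        timeout=10,
+    )
+    resp.raise_for_status()
+    return resp.json()["data"]["result"]
+
+
+if __name__ == "__main__":
+    # Example: request rate for the /users endpoint over the past hour
+    for series in get_api_metrics('rate(http_requests_total{path="/users"}[5m])'):
+        print(series["metric"], series["values"][-1])
+```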
diff --git a/skills/skill-adapter/scripts/helper-template.sh b/skills/skill-adapter/scripts/helper-template.sh new file mode 100755 index 0000000..c4aae90 --- /dev/null +++ b/skills/skill-adapter/scripts/helper-template.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Helper script template for skill automation +# Customize this for your skill's specific needs + +set -e + +function show_usage() { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -v, --verbose Enable verbose output" + echo "" +} + +# Parse arguments +VERBOSE=false + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_usage + exit 0 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + *) + echo "Unknown option: $1" + show_usage + exit 1 + ;; + esac +done + +# Your skill logic here +if [ "$VERBOSE" = true ]; then + echo "Running skill automation..." +fi + +echo "✅ Complete" diff --git a/skills/skill-adapter/scripts/validation.sh b/skills/skill-adapter/scripts/validation.sh new file mode 100755 index 0000000..590af58 --- /dev/null +++ b/skills/skill-adapter/scripts/validation.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Skill validation helper +# Validates skill activation and functionality + +set -e + +echo "🔍 Validating skill..." + +# Check if SKILL.md exists +if [ ! -f "../SKILL.md" ]; then + echo "❌ Error: SKILL.md not found" + exit 1 +fi + +# Validate frontmatter +if ! grep -q "^---$" "../SKILL.md"; then + echo "❌ Error: No frontmatter found" + exit 1 +fi + +# Check required fields +if ! grep -q "^name:" "../SKILL.md"; then + echo "❌ Error: Missing 'name' field" + exit 1 +fi + +if ! grep -q "^description:" "../SKILL.md"; then + echo "❌ Error: Missing 'description' field" + exit 1 +fi + +echo "✅ Skill validation passed"