---
name: monitoring-alerting
description: Automatically applies when implementing monitoring and alerting. Ensures proper metric instrumentation, alerts, SLO/SLI definition, dashboards, and observability patterns.
category: observability
---
# Monitoring and Alerting Patterns
When implementing monitoring and alerting, follow these patterns for reliable observability.
**Trigger Keywords**: monitoring, alerting, metrics, SLO, SLI, SLA, dashboard, observability, instrumentation, Prometheus, Grafana, alert, threshold
**Agent Integration**: Used by `backend-architect`, `devops-engineer`, `sre-engineer`, `performance-engineer`
## ✅ Correct Pattern: Metric Instrumentation
```python
from prometheus_client import Counter, Histogram, Gauge, Summary
from typing import Callable
from functools import wraps
import time

# Define metrics
http_requests_total = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

http_request_duration_seconds = Histogram(
    'http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'endpoint'],
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
)

active_connections = Gauge(
    'active_connections',
    'Number of active connections'
)

database_query_duration = Summary(
    'database_query_duration_seconds',
    'Database query duration',
    ['query_type']
)

def track_request_metrics(endpoint: str):
    """Decorator to track request metrics.

    Note: method and status are hardcoded here for brevity; the
    middleware below derives them from the actual request/response.
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        async def wrapper(*args, **kwargs):
            start_time = time.time()
            try:
                # Execute request
                result = await func(*args, **kwargs)
                # Track success
                http_requests_total.labels(
                    method='GET',
                    endpoint=endpoint,
                    status='200'
                ).inc()
                return result
            except Exception:
                # Track error
                http_requests_total.labels(
                    method='GET',
                    endpoint=endpoint,
                    status='500'
                ).inc()
                raise
            finally:
                # Track duration on both success and error paths
                duration = time.time() - start_time
                http_request_duration_seconds.labels(
                    method='GET',
                    endpoint=endpoint
                ).observe(duration)
        return wrapper
    return decorator

# Usage (db and User are application-level objects)
@track_request_metrics('/api/users')
async def get_users():
    """Get users with metrics tracking."""
    return await db.query(User).all()
```
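The `database_query_duration` Summary above is defined but never exercised; prometheus_client metrics expose a `.time()` helper that works as a decorator or context manager. A minimal sketch (the `db` handle and query call are illustrative):
```python
async def run_query(db, query):
    """Illustrative helper: records query duration on the Summary."""
    # .time() measures the with-block and observes the elapsed seconds
    with database_query_duration.labels('select').time():
        return await db.execute(query)
```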
## FastAPI Middleware for Metrics
```python
from fastapi import FastAPI, Request, Response
from starlette.middleware.base import BaseHTTPMiddleware
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
import time

class MetricsMiddleware(BaseHTTPMiddleware):
    """Middleware to track HTTP metrics."""

    async def dispatch(self, request: Request, call_next):
        """Track request metrics."""
        start_time = time.time()
        # Increment active connections
        active_connections.inc()
        try:
            response = await call_next(request)
            # Track metrics
            duration = time.time() - start_time
            http_requests_total.labels(
                method=request.method,
                endpoint=request.url.path,
                status=str(response.status_code)
            ).inc()
            http_request_duration_seconds.labels(
                method=request.method,
                endpoint=request.url.path
            ).observe(duration)
            return response
        except Exception:
            # Track errors
            http_requests_total.labels(
                method=request.method,
                endpoint=request.url.path,
                status='500'
            ).inc()
            raise
        finally:
            # Decrement active connections
            active_connections.dec()

# Add middleware
app = FastAPI()
app.add_middleware(MetricsMiddleware)

# Metrics endpoint
@app.get("/metrics")
async def metrics():
    """Expose Prometheus metrics."""
    return Response(
        content=generate_latest(),
        media_type=CONTENT_TYPE_LATEST
    )
```
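As an alternative to the hand-rolled endpoint, prometheus_client ships a ready-made ASGI app that can be mounted directly:
```python
from fastapi import FastAPI
from prometheus_client import make_asgi_app

app = FastAPI()
# Serves the default registry in Prometheus text format at /metrics
app.mount("/metrics", make_asgi_app())
```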
## SLO/SLI Definition
```python
from dataclasses import dataclass
from typing import Any, Dict, List
from datetime import timedelta

@dataclass
class SLI:
    """Service Level Indicator."""
    name: str
    description: str
    metric_query: str  # Prometheus query
    target: float      # Target value, in the same units as the query result
    window: timedelta  # Time window
    higher_is_better: bool = True  # False for latency-style SLIs

@dataclass
class SLO:
    """Service Level Objective."""
    name: str
    description: str
    slis: List[SLI]
    error_budget: float  # Percentage (0-100)

# Define SLOs
api_availability_slo = SLO(
    name="API Availability",
    description="API should be available 99.9% of the time",
    slis=[
        SLI(
            name="Request Success Rate",
            description="Fraction of successful requests",
            metric_query="""
            sum(rate(http_requests_total{status=~"2.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
            """,
            target=0.999,  # the query yields a 0-1 ratio, so 99.9% = 0.999
            window=timedelta(days=30)
        )
    ],
    error_budget=0.1  # 0.1% error budget
)

api_latency_slo = SLO(
    name="API Latency",
    description="95th percentile latency should be under 500ms",
    slis=[
        SLI(
            name="P95 Latency",
            description="95th percentile request duration",
            metric_query="""
            histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket[5m]))
                by (le)
            )
            """,
            target=0.5,  # 500ms
            window=timedelta(days=30),
            higher_is_better=False  # lower latency is better
        )
    ],
    error_budget=5.0  # 5% of requests may exceed the threshold
)

class SLOTracker:
    """Track SLO compliance."""

    def __init__(self, prometheus_url: str):
        self.prometheus_url = prometheus_url

    async def check_sli(self, sli: SLI) -> Dict[str, Any]:
        """Check an SLI against its target.

        Returns:
            Dict with current value, target, and compliance.
        """
        # Query Prometheus (one possible implementation is sketched below)
        current_value = await self._query_prometheus(sli.metric_query)
        if sli.higher_is_better:
            is_compliant = current_value >= sli.target
            margin = current_value - sli.target
        else:
            is_compliant = current_value <= sli.target
            margin = sli.target - current_value
        return {
            "name": sli.name,
            "current": current_value,
            "target": sli.target,
            "compliant": is_compliant,
            "margin": margin
        }

    async def check_slo(self, slo: SLO) -> Dict[str, Any]:
        """Check all SLIs for an SLO."""
        sli_results = []
        for sli in slo.slis:
            result = await self.check_sli(sli)
            sli_results.append(result)
        all_compliant = all(r["compliant"] for r in sli_results)
        return {
            "name": slo.name,
            "slis": sli_results,
            "compliant": all_compliant,
            "error_budget_remaining": self._calculate_error_budget(slo, sli_results)
        }
```
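`_query_prometheus` (and `_calculate_error_budget`) are left abstract above. A minimal sketch of the former against the Prometheus HTTP API (`/api/v1/query`), written as a method of `SLOTracker` and assuming the query returns a single instant-vector sample:
```python
import httpx

# Sketch of SLOTracker._query_prometheus
async def _query_prometheus(self, query: str) -> float:
    """Run an instant query and return the first sample's value."""
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            f"{self.prometheus_url}/api/v1/query",
            params={"query": query},
        )
    resp.raise_for_status()
    result = resp.json()["data"]["result"]
    if not result:
        raise ValueError(f"query returned no samples: {query}")
    # Instant-vector samples come back as [timestamp, "value"] pairs
    return float(result[0]["value"][1])
```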
## Alert Rules
```python
from typing import Dict
from pydantic import BaseModel

class AlertRule(BaseModel):
    """Alert rule definition."""
    name: str
    description: str
    query: str  # Prometheus query
    threshold: float
    duration: str  # e.g., "5m"
    severity: str  # critical, warning, info
    annotations: Dict[str, str] = {}
    labels: Dict[str, str] = {}

# Define alert rules
high_error_rate_alert = AlertRule(
    name="HighErrorRate",
    description="Error rate exceeds 1%",
    query="""
    sum(rate(http_requests_total{status=~"5.."}[5m]))
    /
    sum(rate(http_requests_total[5m]))
    > 0.01
    """,
    threshold=0.01,
    duration="5m",
    severity="critical",
    annotations={
        "summary": "High error rate detected",
        "description": "Error rate is {{ $value | humanizePercentage }}"
    },
    labels={
        "team": "backend",
        "component": "api"
    }
)

high_latency_alert = AlertRule(
    name="HighLatency",
    description="P95 latency exceeds 1 second",
    query="""
    histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket[5m]))
        by (le)
    ) > 1.0
    """,
    threshold=1.0,
    duration="10m",
    severity="warning",
    annotations={
        "summary": "High API latency",
        "description": "P95 latency is {{ $value }}s"
    }
)

error_budget_exhausted_alert = AlertRule(
    name="ErrorBudgetExhausted",
    description="Error budget for the availability SLO is exhausted",
    query="""
    (1 - sum(rate(http_requests_total{status=~"2.."}[30d]))
    /
    sum(rate(http_requests_total[30d])))
    > 0.001
    """,
    threshold=0.001,
    duration="1h",
    severity="critical",
    annotations={
        "summary": "Error budget exhausted",
        "description": "SLO violation: error rate {{ $value | humanizePercentage }}"
    }
)

# Export to Prometheus alerting-rule format
def export_prometheus_alert(rule: AlertRule) -> str:
    """Export an alert rule to Prometheus YAML format."""
    # Collapse the PromQL onto one line so the output stays valid YAML
    expr = " ".join(rule.query.split())
    labels = "".join(f"    {k}: {v}\n" for k, v in rule.labels.items())
    # Quote annotations: Go template braces would otherwise break YAML parsing
    annotations = "".join(f'    {k}: "{v}"\n' for k, v in rule.annotations.items())
    return (
        f"- alert: {rule.name}\n"
        f"  expr: {expr}\n"
        f"  for: {rule.duration}\n"
        f"  labels:\n"
        f"    severity: {rule.severity}\n"
        f"{labels}"
        f"  annotations:\n"
        f"{annotations}"
    )
```
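Prometheus loads alert rules from a rules file with a `groups:` top level, so the exported fragments need to be nested under a group. A usage sketch (the group and file names are illustrative):
```python
import textwrap

rules = [high_error_rate_alert, high_latency_alert, error_budget_exhausted_alert]

rules_file = "groups:\n  - name: api-alerts\n    rules:\n" + "".join(
    # Indent each fragment so it sits under the group's rules: key
    textwrap.indent(export_prometheus_alert(rule), "    ")
    for rule in rules
)

with open("api_alerts.yml", "w") as f:
    f.write(rules_file)
```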
## Alert Notification
```python
import httpx
from datetime import datetime
from typing import Any, Dict, Optional

class AlertNotifier:
    """Send alert notifications."""

    def __init__(
        self,
        slack_webhook_url: str,
        pagerduty_key: Optional[str] = None
    ):
        self.slack_webhook_url = slack_webhook_url
        self.pagerduty_key = pagerduty_key

    async def notify_slack(
        self,
        alert_name: str,
        severity: str,
        description: str,
        details: Optional[Dict[str, Any]] = None
    ):
        """Send Slack notification."""
        color = {
            "critical": "#FF0000",
            "warning": "#FFA500",
            "info": "#00FF00"
        }.get(severity, "#808080")
        message = {
            "attachments": [{
                "color": color,
                "title": f"🚨 {alert_name}",
                "text": description,
                "fields": [
                    {
                        "title": "Severity",
                        "value": severity.upper(),
                        "short": True
                    },
                    {
                        "title": "Time",
                        "value": datetime.utcnow().isoformat(),
                        "short": True
                    }
                ],
                "footer": "Monitoring System"
            }]
        }
        if details:
            message["attachments"][0]["fields"].extend([
                {
                    "title": k,
                    "value": str(v),
                    "short": True
                }
                for k, v in details.items()
            ])
        async with httpx.AsyncClient() as client:
            await client.post(
                self.slack_webhook_url,
                json=message
            )

    async def notify_pagerduty(
        self,
        alert_name: str,
        severity: str,
        description: str
    ):
        """Trigger a PagerDuty incident via the Events API v2."""
        if not self.pagerduty_key:
            return
        event = {
            "routing_key": self.pagerduty_key,
            "event_action": "trigger",
            "payload": {
                "summary": f"{alert_name}: {description}",
                "severity": severity,
                "source": "monitoring-system",
                "timestamp": datetime.utcnow().isoformat()
            }
        }
        async with httpx.AsyncClient() as client:
            await client.post(
                "https://events.pagerduty.com/v2/enqueue",
                json=event
            )

# Usage (from within an async context)
notifier = AlertNotifier(
    slack_webhook_url="https://hooks.slack.com/services/...",
    pagerduty_key="..."
)
await notifier.notify_slack(
    alert_name="High Error Rate",
    severity="critical",
    description="Error rate exceeded 1%",
    details={
        "current_rate": "1.5%",
        "threshold": "1.0%"
    }
)
```
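In practice these notifications are usually driven by Alertmanager's webhook receiver rather than called directly. A minimal sketch of an endpoint that bridges Alertmanager's webhook payload to the notifier above (field names follow Alertmanager's v4 webhook schema; the route path is illustrative):
```python
from fastapi import FastAPI, Request

app = FastAPI()

@app.post("/alertmanager/webhook")
async def alertmanager_webhook(request: Request):
    """Forward firing alerts from Alertmanager to Slack."""
    payload = await request.json()
    for alert in payload.get("alerts", []):
        if alert.get("status") != "firing":
            continue
        await notifier.notify_slack(
            alert_name=alert["labels"].get("alertname", "unknown"),
            severity=alert["labels"].get("severity", "info"),
            description=alert["annotations"].get("description", ""),
            details=alert["labels"],
        )
    return {"ok": True}
```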
## Dashboard Configuration
```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class DashboardPanel:
    """Grafana dashboard panel."""
    title: str
    query: str
    visualization: str  # graph, gauge, table
    unit: str = "short"
    thresholds: Optional[List[float]] = None

@dataclass
class Dashboard:
    """Complete dashboard definition."""
    title: str
    panels: List[DashboardPanel]
    refresh: str = "30s"

# Define dashboard
api_dashboard = Dashboard(
    title="API Monitoring",
    panels=[
        DashboardPanel(
            title="Request Rate",
            query="sum(rate(http_requests_total[5m]))",
            visualization="graph",
            unit="reqps"
        ),
        DashboardPanel(
            title="Error Rate",
            query="""
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
            """,
            visualization="gauge",
            unit="percentunit",
            thresholds=[0.01, 0.05]
        ),
        DashboardPanel(
            title="P95 Latency",
            query="""
            histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket[5m]))
                by (le)
            )
            """,
            visualization="graph",
            unit="s",
            thresholds=[0.5, 1.0]
        ),
        DashboardPanel(
            title="Active Connections",
            query="active_connections",
            visualization="gauge",
            unit="short"
        )
    ]
)
```
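To get these definitions into Grafana, the dataclasses can be serialized to Grafana's dashboard JSON model. A hedged sketch under simplified assumptions: real dashboards carry more fields (schema version, datasource references, panel ids), so treat this as a starting point rather than a complete exporter:
```python
import json

def export_grafana_dashboard(dashboard: Dashboard) -> str:
    """Serialize a Dashboard to a simplified Grafana JSON model."""
    panels = []
    for i, panel in enumerate(dashboard.panels):
        panels.append({
            "title": panel.title,
            "type": "timeseries" if panel.visualization == "graph" else panel.visualization,
            "targets": [{"expr": " ".join(panel.query.split())}],
            "fieldConfig": {"defaults": {"unit": panel.unit}},
            # Simple two-column grid layout
            "gridPos": {"h": 8, "w": 12, "x": 12 * (i % 2), "y": 8 * (i // 2)},
        })
    return json.dumps({
        "title": dashboard.title,
        "refresh": dashboard.refresh,
        "panels": panels,
    }, indent=2)
```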
## ❌ Anti-Patterns
```python
# ❌ No metric labels
requests_total = Counter('requests_total', 'Total requests')
# Can't filter by endpoint or status!

# ✅ Better: Use labels
requests_total = Counter(
    'requests_total',
    'Total requests',
    ['method', 'endpoint', 'status']
)

# ❌ Too many metric labels
requests = Counter(
    'requests',
    'Requests',
    ['method', 'endpoint', 'status', 'user_id', 'request_id']
)  # Cardinality explosion!

# ✅ Better: Reasonable labels
requests = Counter(
    'requests',
    'Requests',
    ['method', 'endpoint', 'status']
)

# ❌ No alert threshold
# Just logging errors without alerts

# ✅ Better: Define alert rules with explicit thresholds
if error_rate > 0.01:
    trigger_alert("High error rate")

# ❌ Vague alert messages
"Something is wrong"

# ✅ Better: Specific with context
f"Error rate {error_rate:.2%} exceeds threshold 1% for endpoint /api/users"
```
## Best Practices Checklist
- ✅ Instrument critical paths with metrics
- ✅ Use appropriate metric types (Counter, Gauge, Histogram)
- ✅ Add meaningful labels to metrics
- ✅ Define SLOs/SLIs for key services
- ✅ Create alert rules with thresholds
- ✅ Set up alert notifications (Slack, PagerDuty)
- ✅ Build dashboards for visualization
- ✅ Track error budgets
- ✅ Monitor latency percentiles (P50, P95, P99)
- ✅ Track request rates and error rates
- ✅ Monitor resource usage (CPU, memory; see the sketch after this list)
- ✅ Set up on-call rotation
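For the resource-usage item, a minimal sketch using `psutil` gauges. The library choice is an assumption; in production node_exporter or cAdvisor usually covers this, and prometheus_client's default registry already exports basic process metrics on Linux. This just shows the custom-gauge pattern:
```python
import psutil
from prometheus_client import Gauge

# Process-level resource gauges; values are computed lazily at scrape time
cpu_percent = Gauge('process_cpu_percent', 'Process CPU usage percent')
memory_rss_bytes = Gauge('process_memory_rss_bytes', 'Process resident memory in bytes')

_process = psutil.Process()
cpu_percent.set_function(lambda: _process.cpu_percent(interval=None))
memory_rss_bytes.set_function(lambda: _process.memory_info().rss)
```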
## Auto-Apply
When implementing monitoring:
1. Add Prometheus metrics to key endpoints
2. Define SLOs/SLIs for service
3. Create alert rules with thresholds
4. Set up alert notifications
5. Build Grafana dashboards
6. Track error budgets
7. Monitor latency and error rates
## Related Skills
- `observability-logging` - For logging patterns
- `performance-profiling` - For performance analysis
- `fastapi-patterns` - For API instrumentation
- `structured-errors` - For error tracking