Initial commit
This commit is contained in:
543
skills/spring-boot-actuator/references/observability.md
Normal file
543
skills/spring-boot-actuator/references/observability.md
Normal file
@@ -0,0 +1,543 @@
|
||||
# Observability with Spring Boot Actuator
|
||||
|
||||
Spring Boot Actuator provides comprehensive observability features through integration with Micrometer, including metrics, tracing, and structured logging. This enables monitoring, alerting, and debugging of Spring Boot applications in production.
|
||||
|
||||
## Three Pillars of Observability
|
||||
|
||||
### 1. Metrics
|
||||
Quantitative measurements of application behavior:
|
||||
- Application metrics (requests/second, response times)
|
||||
- JVM metrics (memory usage, garbage collection)
|
||||
- System metrics (CPU, disk usage)
|
||||
- Custom business metrics
|
||||
|
||||
### 2. Tracing
|
||||
Request flow tracking across distributed systems:
|
||||
- Distributed tracing with OpenTelemetry or Zipkin
|
||||
- Span creation and propagation
|
||||
- Request correlation across services
|
||||
- Performance bottleneck identification
|
||||
|
||||
### 3. Logging
|
||||
Structured application event recording:
|
||||
- Centralized logging with correlation IDs
|
||||
- Log level management
|
||||
- Structured logging formats (JSON)
|
||||
- Integration with tracing context
|
||||
|
||||
## Observability Configuration
|
||||
|
||||
### Basic Setup
|
||||
|
||||
```yaml
|
||||
management:
|
||||
endpoints:
|
||||
web:
|
||||
exposure:
|
||||
include: "health,info,metrics,prometheus,loggers"
|
||||
|
||||
metrics:
|
||||
export:
|
||||
prometheus:
|
||||
enabled: true
|
||||
distribution:
|
||||
percentiles-histogram:
|
||||
http.server.requests: true
|
||||
http.client.requests: true
|
||||
|
||||
tracing:
|
||||
sampling:
|
||||
probability: 0.1 # 10% sampling in production
|
||||
|
||||
zipkin:
|
||||
tracing:
|
||||
endpoint: "http://zipkin:9411/api/v2/spans"
|
||||
|
||||
logging:
|
||||
pattern:
|
||||
level: "%5p [%X{traceId:-},%X{spanId:-}]"
|
||||
level:
|
||||
org.springframework.web: DEBUG
|
||||
```
|
||||
|
||||
### Micrometer Integration
|
||||
|
||||
```java
|
||||
@Configuration
|
||||
public class ObservabilityConfiguration {
|
||||
|
||||
@Bean
|
||||
public MeterRegistryCustomizer<MeterRegistry> metricsCommonTags() {
|
||||
return registry -> registry.config()
|
||||
.commonTags("application", "my-app")
|
||||
.commonTags("environment", getEnvironment());
|
||||
}
|
||||
|
||||
@Bean
|
||||
public ObservationRegistryCustomizer<ObservationRegistry> observationRegistryCustomizer() {
|
||||
return registry -> registry.observationConfig()
|
||||
.observationHandler(new LoggingObservationHandler())
|
||||
.observationHandler(new MetricsObservationHandler(meterRegistry()))
|
||||
.observationHandler(new TracingObservationHandler(tracer()));
|
||||
}
|
||||
|
||||
private String getEnvironment() {
|
||||
return System.getProperty("spring.profiles.active", "development");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Custom Observability Components
|
||||
|
||||
### Custom Health Indicators
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class DatabaseHealthIndicator implements HealthIndicator {
|
||||
|
||||
private final DataSource dataSource;
|
||||
|
||||
public DatabaseHealthIndicator(DataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Health health() {
|
||||
try (Connection connection = dataSource.getConnection()) {
|
||||
boolean isValid = connection.isValid(5);
|
||||
|
||||
if (isValid) {
|
||||
return Health.up()
|
||||
.withDetail("database", "PostgreSQL")
|
||||
.withDetail("connection_pool", getConnectionPoolInfo())
|
||||
.build();
|
||||
} else {
|
||||
return Health.down()
|
||||
.withDetail("database", "Connection validation failed")
|
||||
.build();
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
return Health.down(ex)
|
||||
.withDetail("database", "Connection failed")
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Object> getConnectionPoolInfo() {
|
||||
// Return connection pool metrics
|
||||
return Map.of(
|
||||
"active", 5,
|
||||
"idle", 3,
|
||||
"max", 10
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Custom Metrics
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class BusinessMetrics {
|
||||
|
||||
private final Counter orderCounter;
|
||||
private final Timer orderProcessingTime;
|
||||
private final Gauge activeUsers;
|
||||
|
||||
public BusinessMetrics(MeterRegistry meterRegistry) {
|
||||
this.orderCounter = Counter.builder("orders.total")
|
||||
.description("Total number of orders")
|
||||
.tag("type", "all")
|
||||
.register(meterRegistry);
|
||||
|
||||
this.orderProcessingTime = Timer.builder("orders.processing.time")
|
||||
.description("Order processing time")
|
||||
.register(meterRegistry);
|
||||
|
||||
this.activeUsers = Gauge.builder("users.active")
|
||||
.description("Number of active users")
|
||||
.register(meterRegistry, this, BusinessMetrics::getActiveUserCount);
|
||||
}
|
||||
|
||||
public void recordOrder(String orderType) {
|
||||
orderCounter.increment(Tags.of("type", orderType));
|
||||
}
|
||||
|
||||
public void recordOrderProcessingTime(Duration duration) {
|
||||
orderProcessingTime.record(duration);
|
||||
}
|
||||
|
||||
private double getActiveUserCount() {
|
||||
// Implement logic to get active user count
|
||||
return 150.0;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Observation Aspects
|
||||
|
||||
```java
|
||||
@Aspect
|
||||
@Component
|
||||
public class ObservationAspect {
|
||||
|
||||
private final ObservationRegistry observationRegistry;
|
||||
|
||||
public ObservationAspect(ObservationRegistry observationRegistry) {
|
||||
this.observationRegistry = observationRegistry;
|
||||
}
|
||||
|
||||
@Around("@annotation(observed)")
|
||||
public Object observe(ProceedingJoinPoint joinPoint, Observed observed) throws Throwable {
|
||||
String operationName = observed.name().isEmpty() ?
|
||||
joinPoint.getSignature().getName() : observed.name();
|
||||
|
||||
return Observation.createNotStarted(operationName, observationRegistry)
|
||||
.lowCardinalityKeyValues(observed.lowCardinalityKeyValues())
|
||||
.observe(() -> {
|
||||
try {
|
||||
return joinPoint.proceed();
|
||||
} catch (RuntimeException ex) {
|
||||
throw ex;
|
||||
} catch (Throwable ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Distributed Observability
|
||||
|
||||
### Service Correlation
|
||||
|
||||
```java
|
||||
@RestController
|
||||
public class OrderController {
|
||||
|
||||
private final OrderService orderService;
|
||||
private final ObservationRegistry observationRegistry;
|
||||
|
||||
public OrderController(OrderService orderService, ObservationRegistry observationRegistry) {
|
||||
this.orderService = orderService;
|
||||
this.observationRegistry = observationRegistry;
|
||||
}
|
||||
|
||||
@PostMapping("/orders")
|
||||
public ResponseEntity<Order> createOrder(@RequestBody CreateOrderRequest request) {
|
||||
return Observation.createNotStarted("order.create", observationRegistry)
|
||||
.lowCardinalityKeyValue("operation", "create")
|
||||
.lowCardinalityKeyValue("service", "order-service")
|
||||
.observe(() -> {
|
||||
Order order = orderService.createOrder(request);
|
||||
return ResponseEntity.ok(order);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@Service
|
||||
public class OrderService {
|
||||
|
||||
private final PaymentServiceClient paymentClient;
|
||||
|
||||
@Observed(name = "order.processing")
|
||||
public Order createOrder(CreateOrderRequest request) {
|
||||
// Business logic with automatic observation
|
||||
PaymentResult payment = paymentClient.processPayment(request.getPayment());
|
||||
|
||||
if (payment.isSuccessful()) {
|
||||
return saveOrder(request);
|
||||
} else {
|
||||
throw new PaymentFailedException("Payment failed");
|
||||
}
|
||||
}
|
||||
|
||||
private Order saveOrder(CreateOrderRequest request) {
|
||||
// Save order logic
|
||||
return new Order();
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cross-Service Tracing
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class PaymentServiceClient {
|
||||
|
||||
private final WebClient webClient;
|
||||
private final ObservationRegistry observationRegistry;
|
||||
|
||||
public PaymentServiceClient(WebClient.Builder webClientBuilder,
|
||||
ObservationRegistry observationRegistry) {
|
||||
this.webClient = webClientBuilder
|
||||
.baseUrl("http://payment-service")
|
||||
.build();
|
||||
this.observationRegistry = observationRegistry;
|
||||
}
|
||||
|
||||
public PaymentResult processPayment(PaymentRequest request) {
|
||||
return Observation.createNotStarted("payment.process", observationRegistry)
|
||||
.lowCardinalityKeyValue("service", "payment-service")
|
||||
.lowCardinalityKeyValue("method", "POST")
|
||||
.observe(() -> {
|
||||
return webClient
|
||||
.post()
|
||||
.uri("/payments")
|
||||
.bodyValue(request)
|
||||
.retrieve()
|
||||
.bodyToMono(PaymentResult.class)
|
||||
.block();
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Alerting and Monitoring
|
||||
|
||||
### Health-based Alerting
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class HealthAlertManager {
|
||||
|
||||
private final HealthEndpoint healthEndpoint;
|
||||
private final NotificationService notificationService;
|
||||
|
||||
@Scheduled(fixedRate = 30000) // Check every 30 seconds
|
||||
public void checkHealth() {
|
||||
HealthComponent health = healthEndpoint.health();
|
||||
|
||||
if (!Status.UP.equals(health.getStatus())) {
|
||||
Alert alert = Alert.builder()
|
||||
.severity(Alert.Severity.HIGH)
|
||||
.title("Application Health Check Failed")
|
||||
.description("Application health status: " + health.getStatus())
|
||||
.details(health.getDetails())
|
||||
.build();
|
||||
|
||||
notificationService.sendAlert(alert);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Metric-based Alerting
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class MetricAlertManager {
|
||||
|
||||
private final MeterRegistry meterRegistry;
|
||||
private final NotificationService notificationService;
|
||||
|
||||
@Scheduled(fixedRate = 60000) // Check every minute
|
||||
public void checkMetrics() {
|
||||
// Check error rate
|
||||
double errorRate = getErrorRate();
|
||||
if (errorRate > 0.05) { // 5% error rate threshold
|
||||
sendAlert("High Error Rate",
|
||||
String.format("Error rate: %.2f%%", errorRate * 100));
|
||||
}
|
||||
|
||||
// Check response time
|
||||
double avgResponseTime = getAverageResponseTime();
|
||||
if (avgResponseTime > 1000) { // 1 second threshold
|
||||
sendAlert("High Response Time",
|
||||
String.format("Average response time: %.2f ms", avgResponseTime));
|
||||
}
|
||||
|
||||
// Check memory usage
|
||||
double memoryUsage = getMemoryUsage();
|
||||
if (memoryUsage > 0.9) { // 90% memory usage
|
||||
sendAlert("High Memory Usage",
|
||||
String.format("Memory usage: %.2f%%", memoryUsage * 100));
|
||||
}
|
||||
}
|
||||
|
||||
private double getErrorRate() {
|
||||
Timer successTimer = meterRegistry.find("http.server.requests")
|
||||
.tag("status", "200")
|
||||
.timer();
|
||||
Timer errorTimer = meterRegistry.find("http.server.requests")
|
||||
.tag("status", "500")
|
||||
.timer();
|
||||
|
||||
if (successTimer != null && errorTimer != null) {
|
||||
double total = successTimer.count() + errorTimer.count();
|
||||
return total > 0 ? errorTimer.count() / total : 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
private double getAverageResponseTime() {
|
||||
Timer timer = meterRegistry.find("http.server.requests").timer();
|
||||
return timer != null ? timer.mean(TimeUnit.MILLISECONDS) : 0.0;
|
||||
}
|
||||
|
||||
private double getMemoryUsage() {
|
||||
Gauge memoryUsed = meterRegistry.find("jvm.memory.used").gauge();
|
||||
Gauge memoryMax = meterRegistry.find("jvm.memory.max").gauge();
|
||||
|
||||
if (memoryUsed != null && memoryMax != null) {
|
||||
return memoryUsed.value() / memoryMax.value();
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
private void sendAlert(String title, String message) {
|
||||
Alert alert = Alert.builder()
|
||||
.severity(Alert.Severity.MEDIUM)
|
||||
.title(title)
|
||||
.description(message)
|
||||
.timestamp(Instant.now())
|
||||
.build();
|
||||
|
||||
notificationService.sendAlert(alert);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Production Observability Setup
|
||||
|
||||
### Prometheus Configuration
|
||||
|
||||
```yaml
|
||||
management:
|
||||
endpoints:
|
||||
web:
|
||||
exposure:
|
||||
include: "health,info,metrics,prometheus"
|
||||
|
||||
metrics:
|
||||
export:
|
||||
prometheus:
|
||||
enabled: true
|
||||
step: 30s
|
||||
descriptions: true
|
||||
distribution:
|
||||
percentiles-histogram:
|
||||
"[http.server.requests]": true
|
||||
percentiles:
|
||||
"[http.server.requests]": 0.5, 0.95, 0.99
|
||||
slo:
|
||||
"[http.server.requests]": 100ms, 500ms, 1s
|
||||
|
||||
prometheus:
|
||||
metrics:
|
||||
export:
|
||||
enabled: true
|
||||
```
|
||||
|
||||
### Grafana Dashboard Configuration
|
||||
|
||||
```json
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Spring Boot Application Dashboard",
|
||||
"panels": [
|
||||
{
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(http_server_requests_seconds_count[5m])",
|
||||
"legendFormat": "{{method}} {{uri}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Response Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(http_server_requests_seconds_bucket[5m]))",
|
||||
"legendFormat": "95th percentile"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "JVM Memory",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "jvm_memory_used_bytes / jvm_memory_max_bytes * 100",
|
||||
"legendFormat": "Memory Usage %"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Observability Strategy**: Define clear observability goals and SLIs/SLOs
|
||||
2. **Metric Cardinality**: Keep metric labels low-cardinality to avoid performance issues
|
||||
3. **Sampling**: Use appropriate sampling rates for tracing in high-throughput applications
|
||||
4. **Security**: Secure observability endpoints and ensure no sensitive data is exposed
|
||||
5. **Performance**: Monitor the performance impact of observability instrumentation
|
||||
6. **Alerting**: Set up meaningful alerts based on business metrics, not just technical metrics
|
||||
7. **Documentation**: Document your observability setup and runbooks for incident response
|
||||
|
||||
### Complete Production Configuration
|
||||
|
||||
```yaml
|
||||
# Production observability configuration
|
||||
management:
|
||||
endpoints:
|
||||
web:
|
||||
exposure:
|
||||
include: "health,info,metrics,prometheus"
|
||||
base-path: "/actuator"
|
||||
|
||||
endpoint:
|
||||
health:
|
||||
show-details: when-authorized
|
||||
probes:
|
||||
enabled: true
|
||||
metrics:
|
||||
enabled: true
|
||||
prometheus:
|
||||
enabled: true
|
||||
|
||||
metrics:
|
||||
export:
|
||||
prometheus:
|
||||
enabled: true
|
||||
step: 30s
|
||||
distribution:
|
||||
percentiles-histogram:
|
||||
http.server.requests: true
|
||||
percentiles:
|
||||
http.server.requests: 0.5, 0.95, 0.99
|
||||
|
||||
tracing:
|
||||
sampling:
|
||||
probability: 0.01 # 1% sampling in production
|
||||
|
||||
zipkin:
|
||||
tracing:
|
||||
endpoint: "${ZIPKIN_URL:http://localhost:9411/api/v2/spans}"
|
||||
|
||||
logging:
|
||||
pattern:
|
||||
level: "%5p [%X{traceId:-},%X{spanId:-}]"
|
||||
level:
|
||||
root: INFO
|
||||
com.example: DEBUG
|
||||
org.springframework.web: WARN
|
||||
|
||||
# Custom application properties
|
||||
app:
|
||||
observability:
|
||||
alerts:
|
||||
error-rate-threshold: 0.05
|
||||
response-time-threshold: 1000
|
||||
memory-threshold: 0.9
|
||||
retention:
|
||||
metrics: 30d
|
||||
traces: 7d
|
||||
logs: 30d
|
||||
```
|
||||
Reference in New Issue
Block a user