Initial commit
This commit is contained in:
@@ -0,0 +1,471 @@
|
||||
# Common Pitfalls and Solutions
|
||||
|
||||
## Pitfall 1: Lost Messages
|
||||
|
||||
### Problem
|
||||
Messages get lost due to broker failures, network issues, or consumer crashes before acknowledgment.
|
||||
|
||||
### Solution
|
||||
Use persistent messages with acknowledgments:
|
||||
|
||||
```java
|
||||
@Bean
|
||||
public ProducerFactory<String, Object> producerFactory() {
|
||||
Map<String, Object> config = new HashMap<>();
|
||||
config.put(ProducerConfig.ACKS_CONFIG, "all"); // All replicas must acknowledge
|
||||
config.put(ProducerConfig.RETRIES_CONFIG, 3); // Retry failed sends
|
||||
config.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, true); // Prevent duplicates
|
||||
return new DefaultKafkaProducerFactory<>(config);
|
||||
}
|
||||
|
||||
@Bean
|
||||
public ConsumerFactory<String, Object> consumerFactory() {
|
||||
Map<String, Object> config = new HashMap<>();
|
||||
config.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false); // Manual commit
|
||||
return new DefaultKafkaConsumerFactory<>(config);
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Configure producer to wait for all replicas (`acks=all`)
|
||||
- ✓ Enable idempotence to prevent duplicate messages
|
||||
- ✓ Use manual commit for consumers
|
||||
- ✓ Monitor message lag and broker health
|
||||
- ✓ Use transactional outbox pattern
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 2: Duplicate Processing
|
||||
|
||||
### Problem
|
||||
The same message is processed multiple times due to failed acknowledgments or retries, causing duplicate side effects.
|
||||
|
||||
### Solution
|
||||
Implement idempotency with deduplication:
|
||||
|
||||
```java
|
||||
@Service
|
||||
public class DeduplicationService {
|
||||
|
||||
private final DeduplicationRepository repository;
|
||||
|
||||
public boolean isDuplicate(String messageId) {
|
||||
return repository.existsById(messageId);
|
||||
}
|
||||
|
||||
public void recordProcessed(String messageId) {
|
||||
DeduplicatedMessage entry = new DeduplicatedMessage(
|
||||
messageId,
|
||||
Instant.now()
|
||||
);
|
||||
repository.save(entry);
|
||||
}
|
||||
}
|
||||
|
||||
@Component
|
||||
public class PaymentEventListener {
|
||||
|
||||
private final DeduplicationService deduplicationService;
|
||||
private final PaymentService paymentService;
|
||||
|
||||
@Bean
|
||||
public Consumer<PaymentEvent> handlePaymentEvent() {
|
||||
return event -> {
|
||||
String messageId = event.getMessageId();
|
||||
|
||||
if (deduplicationService.isDuplicate(messageId)) {
|
||||
logger.info("Duplicate message ignored: {}", messageId);
|
||||
return;
|
||||
}
|
||||
|
||||
paymentService.processPayment(event);
|
||||
deduplicationService.recordProcessed(messageId);
|
||||
};
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Add unique message ID to all events
|
||||
- ✓ Implement deduplication cache/database
|
||||
- ✓ Make all operations idempotent
|
||||
- ✓ Use optimistic locking (a version field on each entity) for entity updates
|
||||
- ✓ Test with message replay
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 3: Saga State Inconsistency
|
||||
|
||||
### Problem
|
||||
Saga state in database doesn't match actual service states, leading to orphaned or stuck sagas.
|
||||
|
||||
### Solution
|
||||
Use event sourcing or state reconciliation:
|
||||
|
||||
```java
|
||||
@Service
|
||||
public class SagaStateReconciler {
|
||||
|
||||
private final SagaStateRepository stateRepository;
|
||||
private final OrderRepository orderRepository;
|
||||
private final PaymentRepository paymentRepository;
|
||||
|
||||
@Scheduled(fixedDelay = 60000) // Run every minute
|
||||
public void reconcileSagaStates() {
|
||||
List<SagaState> processingSagas = stateRepository
|
||||
.findByStatus(SagaStatus.PROCESSING);
|
||||
|
||||
processingSagas.forEach(saga -> {
|
||||
if (isActuallyCompleted(saga)) {
|
||||
logger.info("Reconciling saga {} - marking as completed", saga.getSagaId());
|
||||
saga.setStatus(SagaStatus.COMPLETED);
|
||||
saga.setCompletedAt(Instant.now());
|
||||
stateRepository.save(saga);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private boolean isActuallyCompleted(SagaState saga) {
|
||||
String orderId = saga.getSagaId();
|
||||
|
||||
Order order = orderRepository.findById(orderId).orElse(null);
|
||||
if (order == null || order.getStatus() != OrderStatus.COMPLETED) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Payment payment = paymentRepository.findByOrderId(orderId).orElse(null);
|
||||
if (payment == null || payment.getStatus() != PaymentStatus.PROCESSED) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Use event sourcing for complete audit trail
|
||||
- ✓ Implement state reconciliation job
|
||||
- ✓ Add health checks for saga coordinator
|
||||
- ✓ Monitor saga state transitions
|
||||
- ✓ Persist compensation steps
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 4: Orchestrator Single Point of Failure
|
||||
|
||||
### Problem
|
||||
An orchestration-based saga fails when the orchestrator is down, blocking all sagas.
|
||||
|
||||
### Solution
|
||||
Implement clustering and failover:
|
||||
|
||||
```java
|
||||
@Configuration
|
||||
public class SagaOrchestratorClusterConfig {
|
||||
|
||||
@Bean
|
||||
public SagaStateRepository sagaStateRepository() {
|
||||
// Use shared database for cluster-wide state
|
||||
return new DatabaseSagaStateRepository();
|
||||
}
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public CommandGateway clusterAwareCommandGateway(
|
||||
CommandBus commandBus) {
|
||||
|
||||
return new ClusterAwareCommandGateway(commandBus);
|
||||
}
|
||||
}
|
||||
|
||||
@Component
|
||||
public class OrchestratorHealthCheck extends AbstractHealthIndicator {
|
||||
|
||||
private final SagaStateRepository repository;
|
||||
|
||||
@Override
|
||||
protected void doHealthCheck(Health.Builder builder) {
|
||||
long stuckSagas = repository.countStuckSagas(Duration.ofMinutes(30));
|
||||
|
||||
if (stuckSagas > 100) {
|
||||
builder.down()
|
||||
.withDetail("stuckSagas", stuckSagas)
|
||||
.withDetail("severity", "critical");
|
||||
} else if (stuckSagas > 10) {
|
||||
builder.degraded()
|
||||
.withDetail("stuckSagas", stuckSagas)
|
||||
.withDetail("severity", "warning");
|
||||
} else {
|
||||
builder.up()
|
||||
.withDetail("stuckSagas", stuckSagas);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Deploy orchestrator in cluster with shared state
|
||||
- ✓ Use distributed coordination (ZooKeeper, Consul)
|
||||
- ✓ Implement heartbeat monitoring
|
||||
- ✓ Set up automatic failover
|
||||
- ✓ Use circuit breakers for service calls
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 5: Non-Idempotent Compensations
|
||||
|
||||
### Problem
|
||||
Compensation logic fails on retry because it is not idempotent, leaving the system in an inconsistent state.
|
||||
|
||||
### Solution
|
||||
Design all compensations to be idempotent:
|
||||
|
||||
```java
|
||||
// Bad - Not idempotent
|
||||
@Service
|
||||
public class BadPaymentService {
|
||||
public void refundPayment(String paymentId) {
|
||||
Payment payment = paymentRepository.findById(paymentId).orElseThrow();
|
||||
payment.setStatus(PaymentStatus.REFUNDED);
|
||||
paymentRepository.save(payment);
|
||||
|
||||
// If this fails partway, retry causes problems
|
||||
externalPaymentGateway.refund(payment.getTransactionId());
|
||||
}
|
||||
}
|
||||
|
||||
// Good - Idempotent
|
||||
@Service
|
||||
public class GoodPaymentService {
|
||||
public void refundPayment(String paymentId) {
|
||||
Payment payment = paymentRepository.findById(paymentId)
|
||||
.orElse(null);
|
||||
|
||||
if (payment == null) {
|
||||
// Already deleted or doesn't exist
|
||||
logger.info("Payment {} not found, skipping refund", paymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
if (payment.getStatus() == PaymentStatus.REFUNDED) {
|
||||
// Already refunded
|
||||
logger.info("Payment {} already refunded", paymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
externalPaymentGateway.refund(payment.getTransactionId());
|
||||
payment.setStatus(PaymentStatus.REFUNDED);
|
||||
paymentRepository.save(payment);
|
||||
} catch (Exception e) {
|
||||
logger.error("Refund failed, will retry", e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Check current state before making changes
|
||||
- ✓ Use status flags to track compensation completion
|
||||
- ✓ Make database updates idempotent
|
||||
- ✓ Test compensation with replays
|
||||
- ✓ Document compensation logic
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 6: Missing Timeouts
|
||||
|
||||
### Problem
|
||||
Sagas hang indefinitely waiting for events that never arrive due to service failures.
|
||||
|
||||
### Solution
|
||||
Implement timeout mechanisms:
|
||||
|
||||
```java
|
||||
@Configuration
|
||||
public class SagaTimeoutConfig {
|
||||
|
||||
@Bean
|
||||
public SagaLifecycle sagaLifecycle(SagaStateRepository repository) {
|
||||
return new SagaLifecycle() {
|
||||
@Override
|
||||
public void onSagaFinished(Saga saga) {
|
||||
// Update saga state
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@Saga
|
||||
public class OrderSaga {
|
||||
|
||||
@Autowired
|
||||
private transient CommandGateway commandGateway;
|
||||
|
||||
private String orderId;
|
||||
private String paymentId;
|
||||
private DeadlineManager deadlineManager;
|
||||
|
||||
@StartSaga
|
||||
@SagaEventHandler(associationProperty = "orderId")
|
||||
public void handle(OrderCreatedEvent event) {
|
||||
this.orderId = event.orderId();
|
||||
|
||||
// Schedule timeout for payment processing
|
||||
deadlineManager.scheduleDeadline(
|
||||
Duration.ofSeconds(30),
|
||||
"PaymentTimeout",
|
||||
orderId
|
||||
);
|
||||
|
||||
commandGateway.send(new ProcessPaymentCommand(...));
|
||||
}
|
||||
|
||||
@DeadlineHandler(deadlineName = "PaymentTimeout")
|
||||
public void handlePaymentTimeout() {
|
||||
logger.warn("Payment processing timed out for order {}", orderId);
|
||||
|
||||
// Compensate
|
||||
commandGateway.send(new CancelOrderCommand(orderId));
|
||||
end();
|
||||
}
|
||||
|
||||
@SagaEventHandler(associationProperty = "orderId")
|
||||
public void handle(PaymentProcessedEvent event) {
|
||||
// Cancel timeout
|
||||
deadlineManager.cancelDeadline("PaymentTimeout", orderId);
|
||||
// Continue saga...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Set timeout for each saga step
|
||||
- ✓ Use deadline manager to track timeouts
|
||||
- ✓ Cancel timeouts when step completes
|
||||
- ✓ Log timeout events
|
||||
- ✓ Alert operations on repeated timeouts
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 7: Tight Coupling Between Services
|
||||
|
||||
### Problem
|
||||
Saga logic couples services tightly, making independent deployment impossible.
|
||||
|
||||
### Solution
|
||||
Use event-driven communication:
|
||||
|
||||
```java
|
||||
// Bad - Tight coupling
|
||||
@Service
|
||||
public class TightlyAgedOrderService {
|
||||
public void createOrder(OrderRequest request) {
|
||||
Order order = orderRepository.save(new Order(...));
|
||||
|
||||
// Direct coupling to payment service
|
||||
paymentService.processPayment(order.getId(), request.getAmount());
|
||||
}
|
||||
}
|
||||
|
||||
// Good - Event-driven
|
||||
@Service
|
||||
public class LooselyAgedOrderService {
|
||||
public void createOrder(OrderRequest request) {
|
||||
Order order = orderRepository.save(new Order(...));
|
||||
|
||||
// Publish event - services listen independently
|
||||
eventPublisher.publish(new OrderCreatedEvent(
|
||||
order.getId(),
|
||||
request.getAmount()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@Component
|
||||
public class PaymentServiceListener {
|
||||
|
||||
@Bean
|
||||
public Consumer<OrderCreatedEvent> handleOrderCreated() {
|
||||
return event -> {
|
||||
// Payment service can be deployed independently
|
||||
paymentService.processPayment(
|
||||
event.orderId(),
|
||||
event.amount()
|
||||
);
|
||||
};
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Use events for inter-service communication
|
||||
- ✓ Avoid direct service-to-service calls
|
||||
- ✓ Define clear contracts for events
|
||||
- ✓ Version events for backward compatibility
|
||||
- ✓ Deploy services independently
|
||||
|
||||
---
|
||||
|
||||
## Pitfall 8: Inadequate Monitoring
|
||||
|
||||
### Problem
|
||||
Sagas fail silently or get stuck without visibility, making troubleshooting impossible.
|
||||
|
||||
### Solution
|
||||
Implement comprehensive monitoring:
|
||||
|
||||
```java
|
||||
@Component
|
||||
public class SagaMonitoring {
|
||||
|
||||
private final MeterRegistry meterRegistry;
|
||||
|
||||
@Bean
|
||||
public MeterBinder sagaMetrics(SagaStateRepository repository) {
|
||||
return (registry) -> {
|
||||
Gauge.builder("saga.active", repository::countByStatus)
|
||||
.description("Number of active sagas")
|
||||
.register(registry);
|
||||
|
||||
Gauge.builder("saga.stuck", () ->
|
||||
repository.countStuckSagas(Duration.ofMinutes(30)))
|
||||
.description("Number of stuck sagas")
|
||||
.register(registry);
|
||||
};
|
||||
}
|
||||
|
||||
public void recordSagaStart(String sagaType) {
|
||||
Counter.builder("saga.started")
|
||||
.tag("type", sagaType)
|
||||
.register(meterRegistry)
|
||||
.increment();
|
||||
}
|
||||
|
||||
public void recordSagaCompletion(String sagaType, long durationMs) {
|
||||
Timer.builder("saga.duration")
|
||||
.tag("type", sagaType)
|
||||
.publishPercentiles(0.5, 0.95, 0.99)
|
||||
.register(meterRegistry)
|
||||
.record(Duration.ofMillis(durationMs));
|
||||
}
|
||||
|
||||
public void recordSagaFailure(String sagaType, String reason) {
|
||||
Counter.builder("saga.failed")
|
||||
.tag("type", sagaType)
|
||||
.tag("reason", reason)
|
||||
.register(meterRegistry)
|
||||
.increment();
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Prevention Checklist
|
||||
- ✓ Track saga state transitions
|
||||
- ✓ Monitor step execution times
|
||||
- ✓ Alert on stuck sagas
|
||||
- ✓ Log all failures with details
|
||||
- ✓ Use distributed tracing (Sleuth, Zipkin)
|
||||
- ✓ Create dashboards for visibility
|
||||
Reference in New Issue
Block a user