# What we had — Resilience4j defaults, lightly tuned
# Count-based window of 100 calls; the breaker opens at a 50% failure rate,
# stays open for 30s, then permits 10 calls while half-open.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 50
      slidingWindowSize: 100
      slidingWindowType: COUNT_BASED
      waitDurationInOpenState: 30s
      permittedNumberOfCallsInHalfOpenState: 10
# What we had — Resilience4j defaults, lightly tuned
# Count-based window of 100 calls; the breaker opens at a 50% failure rate,
# stays open for 30s, then permits 10 calls while half-open.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 50
      slidingWindowSize: 100
      slidingWindowType: COUNT_BASED
      waitDurationInOpenState: 30s
      permittedNumberOfCallsInHalfOpenState: 10
# What we had — Resilience4j defaults, lightly tuned
# Count-based window of 100 calls; the breaker opens at a 50% failure rate,
# stays open for 30s, then permits 10 calls while half-open.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 50
      slidingWindowSize: 100
      slidingWindowType: COUNT_BASED
      waitDurationInOpenState: 30s
      permittedNumberOfCallsInHalfOpenState: 10
// Resilience4j state transitions, simplified
// Every state change of the "paymentService" breaker is logged at WARN and
// counted in the "cb.transition" meter, tagged by breaker name and target state.
CircuitBreaker cb = CircuitBreaker.of("paymentService", config);
cb.getEventPublisher().onStateTransition(event -> {
    CircuitBreaker.StateTransition transition = event.getStateTransition();
    String breakerName = event.getCircuitBreakerName();

    log.warn("CB {} : {} -> {}", breakerName, transition.getFromState(), transition.getToState());

    meterRegistry
            .counter("cb.transition", "name", breakerName, "to", transition.getToState().name())
            .increment();
});
// Resilience4j state transitions, simplified
// Every state change of the "paymentService" breaker is logged at WARN and
// counted in the "cb.transition" meter, tagged by breaker name and target state.
CircuitBreaker cb = CircuitBreaker.of("paymentService", config);
cb.getEventPublisher().onStateTransition(event -> {
    CircuitBreaker.StateTransition transition = event.getStateTransition();
    String breakerName = event.getCircuitBreakerName();

    log.warn("CB {} : {} -> {}", breakerName, transition.getFromState(), transition.getToState());

    meterRegistry
            .counter("cb.transition", "name", breakerName, "to", transition.getToState().name())
            .increment();
});
// Resilience4j state transitions, simplified
// Every state change of the "paymentService" breaker is logged at WARN and
// counted in the "cb.transition" meter, tagged by breaker name and target state.
CircuitBreaker cb = CircuitBreaker.of("paymentService", config);
cb.getEventPublisher().onStateTransition(event -> {
    CircuitBreaker.StateTransition transition = event.getStateTransition();
    String breakerName = event.getCircuitBreakerName();

    log.warn("CB {} : {} -> {}", breakerName, transition.getFromState(), transition.getToState());

    meterRegistry
            .counter("cb.transition", "name", breakerName, "to", transition.getToState().name())
            .increment();
});
# Tuned circuit breaker for paymentService: trips on slow calls as well as
# failures, over a smaller window, and recovers faster.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 30          # was 50
      slowCallRateThreshold: 50         # NEW — slow calls also count
      slowCallDurationThreshold: 2s     # NEW
      slidingWindowSize: 20             # was 100
      minimumNumberOfCalls: 10
      waitDurationInOpenState: 15s      # was 30s
      permittedNumberOfCallsInHalfOpenState: 5
# Tuned circuit breaker for paymentService: trips on slow calls as well as
# failures, over a smaller window, and recovers faster.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 30          # was 50
      slowCallRateThreshold: 50         # NEW — slow calls also count
      slowCallDurationThreshold: 2s     # NEW
      slidingWindowSize: 20             # was 100
      minimumNumberOfCalls: 10
      waitDurationInOpenState: 15s      # was 30s
      permittedNumberOfCallsInHalfOpenState: 5
# Tuned circuit breaker for paymentService: trips on slow calls as well as
# failures, over a smaller window, and recovers faster.
resilience4j.circuitbreaker:
  instances:
    paymentService:
      failureRateThreshold: 30          # was 50
      slowCallRateThreshold: 50         # NEW — slow calls also count
      slowCallDurationThreshold: 2s     # NEW
      slidingWindowSize: 20             # was 100
      minimumNumberOfCalls: 10
      waitDurationInOpenState: 15s      # was 30s
      permittedNumberOfCallsInHalfOpenState: 5
/**
 * Thread-pool bulkhead named "stripe".
 *
 * <p>Configured with 10 core threads, at most 20 threads, a queue of 50
 * pending tasks and a 500 ms keep-alive.
 *
 * @return the bulkhead registered under the name "stripe"
 */
@Bean
public ThreadPoolBulkhead stripeBulkhead() {
    ThreadPoolBulkheadConfig config = ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(20)
            .coreThreadPoolSize(10)
            .queueCapacity(50)
            .keepAliveDuration(Duration.ofMillis(500))
            .build();
    return ThreadPoolBulkhead.of("stripe", config);
}

/**
 * Thread-pool bulkhead named "fraud", with its own, smaller pool
 * (4 core threads, at most 8). Queue capacity and keep-alive are not set
 * here, so the library defaults apply.
 *
 * @return the bulkhead registered under the name "fraud"
 */
@Bean
public ThreadPoolBulkhead fraudBulkhead() {
    // Smaller — fraud is allowed to be slow, not allowed to starve payment
    return ThreadPoolBulkhead.of("fraud", ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(8)
            .coreThreadPoolSize(4)
            .build());
}
/**
 * Thread-pool bulkhead named "stripe".
 *
 * <p>Configured with 10 core threads, at most 20 threads, a queue of 50
 * pending tasks and a 500 ms keep-alive.
 *
 * @return the bulkhead registered under the name "stripe"
 */
@Bean
public ThreadPoolBulkhead stripeBulkhead() {
    ThreadPoolBulkheadConfig config = ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(20)
            .coreThreadPoolSize(10)
            .queueCapacity(50)
            .keepAliveDuration(Duration.ofMillis(500))
            .build();
    return ThreadPoolBulkhead.of("stripe", config);
}

/**
 * Thread-pool bulkhead named "fraud", with its own, smaller pool
 * (4 core threads, at most 8). Queue capacity and keep-alive are not set
 * here, so the library defaults apply.
 *
 * @return the bulkhead registered under the name "fraud"
 */
@Bean
public ThreadPoolBulkhead fraudBulkhead() {
    // Smaller — fraud is allowed to be slow, not allowed to starve payment
    return ThreadPoolBulkhead.of("fraud", ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(8)
            .coreThreadPoolSize(4)
            .build());
}
/**
 * Thread-pool bulkhead named "stripe".
 *
 * <p>Configured with 10 core threads, at most 20 threads, a queue of 50
 * pending tasks and a 500 ms keep-alive.
 *
 * @return the bulkhead registered under the name "stripe"
 */
@Bean
public ThreadPoolBulkhead stripeBulkhead() {
    ThreadPoolBulkheadConfig config = ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(20)
            .coreThreadPoolSize(10)
            .queueCapacity(50)
            .keepAliveDuration(Duration.ofMillis(500))
            .build();
    return ThreadPoolBulkhead.of("stripe", config);
}

/**
 * Thread-pool bulkhead named "fraud", with its own, smaller pool
 * (4 core threads, at most 8). Queue capacity and keep-alive are not set
 * here, so the library defaults apply.
 *
 * @return the bulkhead registered under the name "fraud"
 */
@Bean
public ThreadPoolBulkhead fraudBulkhead() {
    // Smaller — fraud is allowed to be slow, not allowed to starve payment
    return ThreadPoolBulkhead.of("fraud", ThreadPoolBulkheadConfig.custom()
            .maxThreadPoolSize(8)
            .coreThreadPoolSize(4)
            .build());
}
/**
 * Saves the order and records a "payment.charge.requested" outbox event
 * carrying the serialized payment details. No payment provider is called
 * here; the charge is only requested via the outbox row.
 *
 * @param req the order request; {@code req.payment()} becomes the event payload
 * @return the saved order
 * @throws java.io.UncheckedIOException if the payment details cannot be serialized to JSON
 */
@Transactional
public Order placeOrder(OrderRequest req) {
    Order order = orderRepo.save(Order.from(req));

    final String paymentPayload;
    try {
        paymentPayload = objectMapper.writeValueAsString(req.payment());
    } catch (java.io.IOException e) {
        // Jackson 2.x reports serialization failures as a checked IOException subtype.
        // Rethrow unchecked so the signature stays free of checked exceptions and,
        // assuming Spring's default rollback rules, the order insert is rolled back.
        throw new java.io.UncheckedIOException(
                "Failed to serialize payment details for order " + order.getId(), e);
    }

    outboxRepo.save(new OutboxEvent("payment.charge.requested", order.getId(), paymentPayload));
    return order; // returns in <50ms regardless of Stripe latency
}
/**
 * Saves the order and records a "payment.charge.requested" outbox event
 * carrying the serialized payment details. No payment provider is called
 * here; the charge is only requested via the outbox row.
 *
 * @param req the order request; {@code req.payment()} becomes the event payload
 * @return the saved order
 * @throws java.io.UncheckedIOException if the payment details cannot be serialized to JSON
 */
@Transactional
public Order placeOrder(OrderRequest req) {
    Order order = orderRepo.save(Order.from(req));

    final String paymentPayload;
    try {
        paymentPayload = objectMapper.writeValueAsString(req.payment());
    } catch (java.io.IOException e) {
        // Jackson 2.x reports serialization failures as a checked IOException subtype.
        // Rethrow unchecked so the signature stays free of checked exceptions and,
        // assuming Spring's default rollback rules, the order insert is rolled back.
        throw new java.io.UncheckedIOException(
                "Failed to serialize payment details for order " + order.getId(), e);
    }

    outboxRepo.save(new OutboxEvent("payment.charge.requested", order.getId(), paymentPayload));
    return order; // returns in <50ms regardless of Stripe latency
}
/**
 * Saves the order and records a "payment.charge.requested" outbox event
 * carrying the serialized payment details. No payment provider is called
 * here; the charge is only requested via the outbox row.
 *
 * @param req the order request; {@code req.payment()} becomes the event payload
 * @return the saved order
 * @throws java.io.UncheckedIOException if the payment details cannot be serialized to JSON
 */
@Transactional
public Order placeOrder(OrderRequest req) {
    Order order = orderRepo.save(Order.from(req));

    final String paymentPayload;
    try {
        paymentPayload = objectMapper.writeValueAsString(req.payment());
    } catch (java.io.IOException e) {
        // Jackson 2.x reports serialization failures as a checked IOException subtype.
        // Rethrow unchecked so the signature stays free of checked exceptions and,
        // assuming Spring's default rollback rules, the order insert is rolled back.
        throw new java.io.UncheckedIOException(
                "Failed to serialize payment details for order " + order.getId(), e);
    }

    outboxRepo.save(new OutboxEvent("payment.charge.requested", order.getId(), paymentPayload));
    return order; // returns in <50ms regardless of Stripe latency
}

- Flash-sale spike hits the gateway at 10x RPS.
- Order Service synchronously calls Payment for every checkout.
- Stripe's p99 spikes to 14s under provider-side load.
- Payment Service threads block on those timeouts.
- failureRateThreshold=50% breached → Payment CB transitions to OPEN.
- Subsequent calls fail-fast → fallback handler enqueues "deferred order" responses to Kafka.
- Order Service's own CB drops to HALF-OPEN, probing with limited concurrency.
- Bulkhead isolation prevents the cascade from reaching Inventory, Notifications, or User services.

- Trip on latency, not just errors. slowCallRateThreshold is the most underused knob in Resilience4j.
- One bulkhead per downstream, always. Coarse pools will betray you the moment two dependencies fail differently.
- Synchronous chains across third-party APIs are tech debt. An outbox + queue is more code, but it's the difference between a postmortem and an incident report.