Resilience Patterns Skill
Use this skill when:
- Building fault-tolerant distributed systems
- Implementing retry strategies with exponential backoff
- Creating circuit breaker patterns for failure isolation
- Designing graceful degradation strategies
- Building systems that continue working despite partial failures
- Implementing timeout handling and bulkhead patterns
When to Use
Use this skill when:
- Your application calls external services (APIs, databases, message queues)
- You need high availability despite component failures
- You're building distributed systems with unreliable networks
- You need to prevent cascading failures
- You're implementing retry logic for transient failures
Key Scenarios
- External Service Calls: APIs, databases, message queues may fail
- Network Issues: Packet loss, latency spikes, partitions
- Service Degradation: Third-party services may be slow or unavailable
- Resource Exhaustion: Database connections, memory limits
- Partial System Failure: Some components fail while others work
Circuit Breaker Pattern
1. State Machine
defmodule MyApp.CircuitBreaker do
use GenServer
require Logger
@states [:closed, :open, :half_open]
@default_threshold 5
@default_timeout 60_000
@default_retry_timeout 10_000
# Client API
def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: Keyword.get(opts, :name, __MODULE__))
def call(service_name, fun, opts \\ []) do
GenServer.call(__MODULE__, {:call, service_name, fun, opts})
end
# Server Callbacks
@impl true
def init(opts) do
Logger.info("Starting circuit breaker")
{:ok, %{
state: :closed,
failures: 0,
threshold: Keyword.get(opts, :threshold, @default_threshold),
timeout: Keyword.get(opts, :timeout, @default_timeout),
retry_timeout: Keyword.get(opts, :retry_timeout, @default_retry_timeout),
last_failure_time: nil,
services: %{}
}}
end
@impl true
def handle_call({:call, service_name, fun, _opts}, _from, state) do
  case get_circuit_state(state, service_name) do
    :closed ->
      {result, new_state} = execute_and_track(service_name, fun, state)
      {:reply, result, new_state}

    :open ->
      Logger.warning("Circuit breaker OPEN for #{service_name}")
      {:reply, {:error, :circuit_open}, state}

    :half_open ->
      result = test_service(service_name, fun)
      {:reply, result, update_after_test(service_name, result, state)}
  end
end
@impl true
def handle_info({:reset_circuit, service_name}, state) do
Logger.info("Resetting circuit breaker for #{service_name}")
{:noreply, reset_circuit(service_name, state)}
end
# Implementation
defp get_circuit_state(state, service_name) do
case get_in(state, [:services, service_name, :state]) do
nil -> :closed
circuit_state -> circuit_state
end
end
defp execute_and_track(service_name, fun, state) do
  try do
    result = fun.()
    # Success: reset failure tracking for this service
    new_state =
      put_in(state, [:services, service_name], %{
        state: :closed,
        failures: 0,
        last_failure_time: nil
      })

    {{:ok, result}, new_state}
  rescue
    error ->
      Logger.error("Service #{service_name} failed: #{inspect(error)}")
      # Increment failures and check against the threshold
      failures = (get_in(state, [:services, service_name, :failures]) || 0) + 1
      threshold = get_in(state, [:services, service_name, :threshold]) || state.threshold

      new_state =
        if failures >= threshold do
          Logger.warning("Circuit breaker opening for #{service_name} after #{failures} failures")
          timeout = get_in(state, [:services, service_name, :timeout]) || state.timeout
          # Schedule the transition to half-open
          Process.send_after(self(), {:reset_circuit, service_name}, timeout)

          put_in(state, [:services, service_name], %{
            state: :open,
            failures: failures,
            last_failure_time: DateTime.utc_now()
          })
        else
          put_in(state, [:services, service_name], %{
            state: :closed,
            failures: failures,
            last_failure_time: DateTime.utc_now()
          })
        end

      {{:error, error}, new_state}
  end
end
defp test_service(service_name, fun) do
  try do
    {:ok, fun.()}
  rescue
    error ->
      Logger.error("Service #{service_name} failed in half-open state: #{inspect(error)}")
      {:error, error}
  end
end

defp update_after_test(service_name, {:ok, _result}, state) do
  # Success in half-open: close the circuit
  put_in(state, [:services, service_name], %{
    state: :closed,
    failures: 0,
    last_failure_time: nil
  })
end

defp update_after_test(service_name, {:error, _reason}, state) do
  # Failure in half-open: reopen the circuit and schedule another half-open attempt
  timeout = get_in(state, [:services, service_name, :timeout]) || state.timeout
  Process.send_after(self(), {:reset_circuit, service_name}, timeout)

  put_in(state, [:services, service_name], %{
    state: :open,
    failures: (get_in(state, [:services, service_name, :failures]) || 0) + 1,
    last_failure_time: DateTime.utc_now()
  })
end
defp reset_circuit(service_name, state) do
put_in(state, [:services, service_name], %{
state: :half_open,
failures: 0,
last_failure_time: nil
})
end
end
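A minimal usage sketch for the breaker above (the :payments_api service name, PaymentsClient.charge/1, and the order/receipt values are hypothetical placeholders; in practice start the GenServer under your supervision tree):
{:ok, _pid} = MyApp.CircuitBreaker.start_link(threshold: 5, timeout: 60_000)

case MyApp.CircuitBreaker.call(:payments_api, fn -> PaymentsClient.charge(order) end) do
  {:ok, receipt} ->
    {:ok, receipt}

  {:error, :circuit_open} ->
    # Fail fast instead of hammering a known-bad dependency
    {:error, :payments_unavailable}

  {:error, reason} ->
    {:error, reason}
end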
Retry Strategies
1. Exponential Backoff
defmodule MyApp.Retry do
require Logger
@default_max_attempts 3
@default_base_delay 100
@default_max_delay 10_000
@default_jitter 0.1
# Client API
def with_retry(fun, opts \\ []) do
max_attempts = Keyword.get(opts, :max_attempts, @default_max_attempts)
base_delay = Keyword.get(opts, :base_delay, @default_base_delay)
max_delay = Keyword.get(opts, :max_delay, @default_max_delay)
jitter = Keyword.get(opts, :jitter, @default_jitter)
do_retry(fun, max_attempts, base_delay, max_delay, jitter, 1)
end
# Implementation
defp do_retry(fun, max_attempts, base_delay, max_delay, jitter, attempt) do
Logger.info("Attempt #{attempt}/#{max_attempts}")
try do
result = fun.()
{:ok, result}
rescue
error ->
Logger.error("Attempt #{attempt} failed: #{inspect(error)}")
if attempt < max_attempts do
# Calculate delay with exponential backoff and jitter
delay = calculate_delay(base_delay, max_delay, attempt, jitter)
Logger.info("Retrying in #{delay}ms...")
:timer.sleep(delay)
# Retry
do_retry(fun, max_attempts, base_delay, max_delay, jitter, attempt + 1)
else
# Max attempts reached
Logger.error("Max attempts (#{max_attempts}) reached")
{:error, error}
end
end
end
defp calculate_delay(base_delay, max_delay, attempt, jitter) do
  # Exponential backoff: base * 2^(attempt - 1)
  exponential_delay = trunc(base_delay * :math.pow(2, attempt - 1))
  # Cap at max delay
  capped_delay = min(exponential_delay, max_delay)
  # Add jitter to avoid thundering herd (guard against :rand.uniform(0), which raises)
  jitter_amount = trunc(capped_delay * jitter)
  random_jitter = if jitter_amount > 0, do: :rand.uniform(jitter_amount), else: 0
  capped_delay + random_jitter
end
end
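A usage sketch for MyApp.Retry.with_retry/2; the HTTPoison.get!/1 health-check URL is just an example of an operation that raises on transient failure:
case MyApp.Retry.with_retry(fn -> HTTPoison.get!("https://api.example.com/health") end,
       max_attempts: 5, base_delay: 200) do
  {:ok, %{status_code: 200}} -> :healthy
  {:ok, _response} -> :degraded
  {:error, reason} -> {:error, reason}
end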
2. Retry with Backoff Library
# Use retry library
defp deps do
[
{:retry, "~> 0.18"}
]
end
defmodule MyApp.ExternalService do
  # `use Retry` brings in the retry/retry_while macros and the delay-stream helpers
  use Retry
  require Logger

  def call_api(data) do
    # Exponential backoff starting at 100ms, randomized, capped at 1s, giving up after 3s total
    retry_while with: exponential_backoff(100) |> randomize() |> cap(1_000) |> expiry(3_000) do
      case attempt_api_call(data) do
        {:ok, result} ->
          {:halt, {:ok, result}}

        {:retry, reason} ->
          Logger.warning("Transient failure: #{inspect(reason)}, retrying")
          {:cont, {:error, reason}}

        {:stop, error} ->
          # Non-transient error: stop retrying immediately
          {:halt, error}
      end
    end
  end

  defp attempt_api_call(data) do
    case HTTPoison.post("https://api.example.com/endpoint", Jason.encode!(data)) do
      {:ok, %{status_code: 200, body: body}} -> {:ok, Jason.decode!(body)}
      # Transient failures: worth retrying
      {:ok, %{status_code: status}} when status in [502, 503, 504] -> {:retry, :service_unavailable}
      {:error, _reason} -> {:retry, :network_error}
      # Client errors: retrying won't help
      {:ok, %{status_code: 404}} -> {:stop, {:error, :not_found}}
      {:ok, %{status_code: status}} -> {:stop, {:error, {:unexpected_status, status}}}
    end
  end
end
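Callers only see the final outcome; retries and backoff stay hidden inside the module. A sketch (the payload and the process_response/schedule_for_later helpers are hypothetical):
case MyApp.ExternalService.call_api(%{event: "signup", user_id: 123}) do
  {:ok, body} -> process_response(body)
  {:error, :not_found} -> :skip
  {:error, reason} -> schedule_for_later(reason)
end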
Bulkhead Patterns
1. Task Pool Limitation
defmodule MyApp.Bulkhead do
use GenServer
require Logger
@default_pool_size 10
@default_queue_size 100
# Client API
def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: Keyword.get(opts, :name, __MODULE__))
def submit_task(task_name, fun), do: GenServer.cast(__MODULE__, {:submit_task, task_name, fun})
# Server Callbacks
@impl true
def init(opts) do
pool_size = Keyword.get(opts, :pool_size, @default_pool_size)
queue_size = Keyword.get(opts, :queue_size, @default_queue_size)
Logger.info("Starting bulkhead with pool_size: #{pool_size}, queue_size: #{queue_size}")
# Start task pool
{:ok, task_supervisor} = Task.Supervisor.start_link([])
{:ok, %{
task_supervisor: task_supervisor,
pool_size: pool_size,
queue_size: queue_size,
active_tasks: 0,
queue: :queue.new()
}}
end
@impl true
def handle_cast({:submit_task, task_name, fun}, state) do
if :queue.len(state.queue) >= state.queue_size do
Logger.warning("Queue full, rejecting task #{task_name}")
{:noreply, state}
else
# Queue task
new_queue = :queue.in({task_name, fun}, state.queue)
new_state = %{state | queue: new_queue}
# Try to process
{:noreply, process_queue(new_state)}
end
end
@impl true
def handle_info({:DOWN, _ref, :process, _pid, reason}, state) do
Logger.info("Task completed, reason: #{inspect(reason)}")
new_state = %{state | active_tasks: state.active_tasks - 1}
# Process next task from queue
{:noreply, process_queue(new_state)}
end
defp process_queue(state) do
cond do
# Queue empty or at pool limit: do nothing
:queue.is_empty(state.queue) or state.active_tasks >= state.pool_size ->
state
# Process next task
true ->
case :queue.out(state.queue) do
  {{:value, {task_name, fun}}, new_queue} ->
    # Start the task under the task supervisor and monitor it so we
    # receive a :DOWN message when it finishes (or crashes)
    {:ok, pid} =
      Task.Supervisor.start_child(state.task_supervisor, fn ->
        Logger.info("Starting task: #{task_name}")
        fun.()
      end)

    Process.monitor(pid)

    %{state | queue: new_queue, active_tasks: state.active_tasks + 1}

  {:empty, _} ->
    state
end
end
end
end
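A usage sketch for the bulkhead (report generation is a placeholder workload): at most pool_size tasks run concurrently, up to queue_size wait in the queue, and anything beyond that is rejected rather than exhausting resources:
{:ok, _pid} = MyApp.Bulkhead.start_link(pool_size: 5, queue_size: 50)

Enum.each(1..200, fn id ->
  MyApp.Bulkhead.submit_task("report-#{id}", fn -> Reports.generate(id) end)
end)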
Timeout Handling
1. GenServer with Timeout
defmodule MyApp.TimeoutHandler do
use GenServer
require Logger
@default_timeout 5_000
# Client API
def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: Keyword.get(opts, :name, __MODULE__))
def process_data(data, timeout \\ @default_timeout) do
  GenServer.call(__MODULE__, {:process, data}, timeout)
end
# Server Callbacks
@impl true
def init(opts), do: {:ok, opts}
@impl true
def handle_call({:process, data}, from, state) do
  Logger.info("Processing data")

  # Run the long operation in a task so the GenServer stays responsive;
  # the caller's GenServer.call timeout still applies
  {:ok, pid} =
    Task.start(fn ->
      result = do_long_operation(data)
      GenServer.reply(from, {:ok, result})
    end)

  # Monitor the task so a crash is logged instead of silently losing the reply
  Process.monitor(pid)

  # Don't reply here - the task replies when it finishes
  {:noreply, state}
end
@impl true
def handle_info({:DOWN, _ref, :process, _pid, reason}, state) do
Logger.error("Task crashed: #{inspect(reason)}")
{:noreply, state}
end
defp do_long_operation(data) do
# Simulate long operation
:timer.sleep(2_000)
{:processed, data}
end
end
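On the caller side, a timeout surfaces as an exit from GenServer.call/3. A sketch of converting it into an error tuple instead of crashing the caller (the payload is a placeholder; with the 2s operation above, a 1s timeout will trip):
{:ok, _pid} = MyApp.TimeoutHandler.start_link([])

result =
  try do
    MyApp.TimeoutHandler.process_data(%{id: 42}, 1_000)
  catch
    # GenServer.call exits with {:timeout, {GenServer, :call, ...}} when the reply is late
    :exit, {:timeout, _} -> {:error, :timeout}
  end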
Graceful Degradation
1. Feature Flags
defmodule MyApp.FeatureFlags do
use GenServer
require Logger
# Client API
def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: Keyword.get(opts, :name, __MODULE__))
def is_enabled?(feature_name), do: GenServer.call(__MODULE__, {:is_enabled, feature_name})
def enable(feature_name), do: GenServer.cast(__MODULE__, {:enable, feature_name})
def disable(feature_name), do: GenServer.cast(__MODULE__, {:disable, feature_name})
# Server Callbacks
@impl true
def init(_opts) do
Logger.info("Starting feature flags")
features = Application.get_env(:my_app, :features, %{})
{:ok, features}
end
@impl true
def handle_call({:is_enabled, feature_name}, _from, state) do
enabled = Map.get(state, feature_name, false)
Logger.info("Feature #{feature_name} enabled: #{enabled}")
{:reply, enabled, state}
end
@impl true
def handle_cast({:enable, feature_name}, state) do
Logger.info("Enabling feature: #{feature_name}")
{:noreply, Map.put(state, feature_name, true)}
end
@impl true
def handle_cast({:disable, feature_name}, state) do
Logger.warning("Disabling feature: #{feature_name}")
{:noreply, Map.put(state, feature_name, false)}
end
end
# Use in application
defmodule MyApp.UserController do
alias MyApp.FeatureFlags
def index(conn, _params) do
# Check if advanced search is enabled
if FeatureFlags.is_enabled?(:advanced_search) do
render_advanced_search(conn)
else
render_basic_search(conn)
end
end
end
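Feature flags pair naturally with fallbacks: when a dependency is failing (or its circuit is open), degrade to a cheaper code path instead of returning an error. A sketch combining the modules above; RecsClient.fetch/1 and Cache.popular_items/0 are hypothetical placeholders:
defmodule MyApp.Recommendations do
  alias MyApp.{CircuitBreaker, FeatureFlags}

  # Personalized recommendations when everything is healthy,
  # otherwise a cached/popular list so the page still renders
  def for_user(user_id) do
    with true <- FeatureFlags.is_enabled?(:personalized_recommendations),
         {:ok, recs} <- CircuitBreaker.call(:recs_api, fn -> RecsClient.fetch(user_id) end) do
      {:ok, recs}
    else
      _ -> {:ok, Cache.popular_items()}
    end
  end
end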
Best Practices
DO
- ✅ Start with circuit breaker: Protect against cascading failures
- ✅ Use exponential backoff: Avoid overwhelming failing services
- ✅ Add jitter to retries: Prevent thundering herd
- ✅ Implement timeouts: Prevent hanging requests
- ✅ Use bulkheads: Limit concurrent operations
- ✅ Graceful degradation: Provide fallback behavior
- ✅ Log failures: Track failure patterns
- ✅ Monitor circuit states: Alert when circuits open
- ✅ Test failure scenarios: Chaos engineering
- ✅ Configure thresholds: Adjust based on service reliability
DON'T
- ❌ Infinite retries: Always limit retry attempts
- ❌ Ignore timeout errors: Handle timeouts explicitly
- ❌ Retry on client errors: Only retry transient failures
- ❌ Hardcode delays: Use configurable timeout/delay values
- ❌ Ignore circuit state: Don't call services when the circuit is open
- ❌ Forget about monitoring: Alert on circuit state changes
- ❌ Mix concerns: Keep retry logic separate from business logic
- ❌ Forget to back off: Use exponential backoff with jitter
- ❌ Silently swallow errors: Always log or handle errors
Integration with ai-rules
Roles to Reference
- Architect: Use for fault-tolerant system design
- Orchestrator: Implement resilience patterns in features
- Reviewer: Verify resilience patterns are properly implemented
- DevOps Engineer: Configure timeouts and thresholds
- QA: Test failure scenarios (network partitions, service failures)
Skills to Reference
- distributed-systems: Combine with clustering strategies
- observability: Monitor circuit states and failure patterns
- test-generation: Write tests for failure scenarios
Summary
Resilience patterns provide:
- ✅ Circuit breaker for failure isolation
- ✅ Exponential backoff for retry strategies
- ✅ Timeout handling to prevent hanging
- ✅ Bulkheads to limit resource usage
- ✅ Graceful degradation for partial failures
- ✅ Feature flags for controlled rollout
Key: Design for failure, protect against cascading failures, and monitor resilience patterns.