 from enum import Enum
 from typing import NotRequired, TypedDict

+from django.conf import settings
+
+from sentry.ratelimits.sliding_windows import Quota, RedisSlidingWindowRateLimiter
+
 logger = logging.getLogger(__name__)

+# XXX: If either of these values changes, the `CircuitBreakerConfig` docstrings
+# need to be updated
+# How many times stricter to be with the error limit during recovery
+DEFAULT_RECOVERY_STRICTNESS = 10
+# How many times the length of the error window to make the recovery window
+DEFAULT_RECOVERY_WINDOW_MULTIPLIER = 2
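+# With these defaults, an illustrative `error_limit` of 200 gives a RECOVERY-state
+# error limit of 200 // 10 = 20, and a 600-second `error_limit_window` gives a
+# default `recovery_duration` of 600 * 2 = 1200 seconds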
+
 
 class CircuitBreakerState(Enum):
     OK = "circuit_okay"
@@ -37,3 +48,133 @@ class CircuitBreakerConfig(TypedDict):
     # error limit) before returning to normal operation. Will be set to twice `error_limit_window`
     # if not provided.
     recovery_duration: NotRequired[int]
+
+
+class CircuitBreaker:
+    """
+    A circuit breaker to be used to temporarily block requests to or calls of a service or function
+    which is throwing too many errors.
+
+    The breaker has three states: circuit OK, circuit BROKEN, and circuit in RECOVERY. (These states
+    respectively correspond to the closed, open, and half-open states of the traditional circuit
+    breaker model, but are hopefully easier to keep straight than a model where closed is good and
+    open is bad.)
+
+    In an OK state (normal operation), the breaker tracks errors but allows through all requests.
+    If the frequency of errors passes a given threshold, it moves to BROKEN state.
+
+    In a BROKEN state, all requests are blocked. Once a set amount of time has passed, it moves
+    to RECOVERY state.
+
+    RECOVERY state is identical to OK state, except that the threshold for the circuit breaking
+    (moving back into BROKEN state) is much stricter. Once a set amount of time has passed
+    without the breaker being tripped, it moves back to OK state.
+
+    The overall idea is to stop hitting a service which seems to be failing, but periodically make
+    short attempts to use it in order to be able to resume requests once it comes back up.
+
+    Usage:
+
+        # See `CircuitBreakerConfig` class for config options
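+        # An illustrative config - the keys come from `CircuitBreakerConfig`, but
+        # the numbers here are made up
+        config: CircuitBreakerConfig = {
+            "error_limit": 200,
+            "error_limit_window": 600,  # seconds
+            "broken_state_duration": 120,  # seconds
+        }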
+        breaker = CircuitBreaker("squirrel_chasing", config)
+
+        def get_top_dogs(payload):
+            # Check the state of the breaker before calling the service
+            try:
+                if breaker.should_allow_request():
+                    response = call_chase_simulation_service("/hall-of-fame", payload)
+                else:
+                    logger.warning("Request blocked by circuit breaker!")
+                    return None
+
+            # Call `record_error` only in `except` blocks whose errors should count towards the quota
+            except TimeoutError:
+                breaker.record_error()
+                return "timeout"  # or reraise
+            except BadInputError:
+                return "bad input"
+            except Exception:
+                breaker.record_error()
+                return "unknown error"
+
+            # Call `record_error` for other problems which should count as errors
+            if response.status == 500:
+                breaker.record_error()
+                return f"got {response.status}"
+
+            return format_hof_entries(response)
+
+    The `breaker.should_allow_request()` check can alternatively be used outside of `get_top_dogs`,
+    to prevent calls to it. In that case, the original `breaker` object can be imported alongside
+    `get_top_dogs` or reinstantiated with the same config - it has no state of its own, instead
+    relying on redis-backed rate limiters and redis itself to track error count and breaker status.
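+
+        # e.g. in a module that doesn't import `breaker`, a new instance with the
+        # same key and config reads the same redis-backed state (illustrative):
+        breaker = CircuitBreaker("squirrel_chasing", config)
+        if not breaker.should_allow_request():
+            ...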
+    """
+
+    def __init__(self, key: str, config: CircuitBreakerConfig):
+        self.key = key
+        self.broken_state_key = f"{key}.circuit_breaker.broken"
+        self.recovery_state_key = f"{key}.circuit_breaker.in_recovery"
+
+        self.error_limit = config["error_limit"]
+        default_recovery_error_limit = max(self.error_limit // DEFAULT_RECOVERY_STRICTNESS, 1)
+        self.recovery_error_limit = config.get("recovery_error_limit", default_recovery_error_limit)
+
+        self.window = config["error_limit_window"]
+        self.window_granularity = config.get(
+            "error_limit_window_granularity", max(self.window // 20, 5)
+        )
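+        # e.g. an illustrative 600-second window defaults to 600 // 20 = 30-second
+        # granularity buckets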
+
+        self.broken_state_duration = config["broken_state_duration"]
+        self.recovery_duration = config.get(
+            "recovery_duration", self.window * DEFAULT_RECOVERY_WINDOW_MULTIPLIER
+        )
+
+        self.limiter = RedisSlidingWindowRateLimiter()
+        self.redis_pipeline = self.limiter.client.pipeline()
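+        # (Presumably used by the breaker's state-checking and error-recording
+        # methods to batch their redis commands)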
+
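+        # The primary quota enforces `error_limit` during the OK state; the stricter
+        # recovery quota takes over during the RECOVERY state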
+        self.primary_quota = Quota(
+            self.window,
+            self.window_granularity,
+            self.error_limit,
+            f"{key}.circuit_breaker.ok",
+        )
+        self.recovery_quota = Quota(
+            self.window,
+            self.window_granularity,
+            self.recovery_error_limit,
+            f"{key}.circuit_breaker.recovery",
+        )
+
+        # In dev, throw an error on bad config so it can be fixed permanently. In prod, just warn
+        # and fix it ourselves.
+        log = logger.error if settings.DEBUG else logger.warning
+
+        if self.recovery_error_limit >= self.error_limit:
+            log(
+                "Circuit breaker '%s' has a recovery error limit (%d) greater than or equal"
+                + " to its primary error limit (%d). Using the stricter error-limit-based"
+                + " default (%d) instead.",
+                key,
+                self.recovery_error_limit,
+                self.error_limit,
+                default_recovery_error_limit,
+            )
+            self.recovery_error_limit = default_recovery_error_limit
+
+        # XXX: If we discover we have a config where we want this combo to work, we can consider
+        # using the `MockCircuitBreaker._clear_quota` helper, which is currently only used in tests,
+        # to clear out the main quota when we switch to the BROKEN state. (It will need tests of its
+        # own if so.)
+        if self.broken_state_duration + self.recovery_duration < self.window:
+            logger.warning(
+                "Circuit breaker '%s' has BROKEN and RECOVERY state durations (%d and %d sec, respectively)"
+                + " which together are less than the main error limit window (%d sec). This can lead to the"
+                + " breaker getting tripped unexpectedly, until the original spike in errors clears the"
+                + " main time window. Extending RECOVERY period to %d seconds, to give the primary quota time"
+                + " to clear.",
+                key,
+                self.broken_state_duration,
+                self.recovery_duration,
+                self.window,
+                self.window - self.broken_state_duration,
+            )
+            self.recovery_duration = self.window - self.broken_state_duration