11import logging
22import time
33from enum import Enum
4- from math import ceil
5- from typing import Any , Literal , NotRequired , TypedDict , cast , overload
4+ from typing import Literal , NotRequired , TypedDict , overload
65
76from django .core .cache import cache
87
1514
1615logger = logging .getLogger (__name__ )
1716
17+ # How many times stricter to be with the error limit during recovery
18+ DEFAULT_RECOVERY_STRICTNESS = 10
19+
1820
1921class CircuitBreakerState (Enum ):
2022 CLOSED = "circuit_closed"
@@ -27,41 +29,35 @@ class CircuitBreakerConfig(TypedDict):
2729 error_limit : int
2830 # The time period, in seconds, over which we're tracking errors
2931 error_limit_window : int
30- # The length, in seconds, of each time bucket ("granule") used by the underlying rate limiter -
31- # effectively the resolution of the time window. Will be set automatically based on
32- # `error_limit_window` if not provided.
33- error_limit_window_granularity : NotRequired [int ]
3432 # How long, in seconds, to stay in the broken state (blocking all requests) before entering the
3533 # recovery phase
3634 broken_state_duration : int
37- # The number of errors within the given time period necessary to trip the breaker while in recovery
38- recovery_error_limit : int
39- # The time period, in seconds, over which we're tracking errors in recovery
40- recovery_error_limit_window : int
35+ # The number of errors within the given time period necessary to trip the breaker while in
36+ # recovery. Will be set automatically based on `error_limit` if not provided
37+ recovery_error_limit : NotRequired [int ]
4138 # The length, in seconds, of each time bucket ("granule") used by the underlying rate limiter -
4239 # effectively the resolution of the time window. Will be set automatically based on
43- # `recovery_error_limit_window ` if not provided.
44- recovery_error_limit_window_granularity : NotRequired [int ]
40+ # `error_limit_window ` if not provided.
41+ error_limit_window_granularity : NotRequired [int ]
4542 # How long, in seconds, to stay in the recovery state (allowing requests but with a stricter
46- # error limit) before returning to normal operation.
47- recovery_duration : int
43+ # error limit) before returning to normal operation. Will be set based on `error_limit_window`
44+ # if not provided.
45+ recovery_duration : NotRequired [int ]
4846
4947
50- # TODO: These limits were estimated based on EA traffic. (In an average 10 min period, there are
51- # roughly 35K events without matching hashes. About 2% of orgs are EA, so for simplicity, assume 2%
52- # of those events are from EA orgs. If we're willing to tolerate up to a 95% failure rate, then we
53- # need 35K * 0.02 * 0.95 events to fail to trip the breaker. Technically that's 665, not 666, but
54- # we're talking about everything going to hell, so the bump to 666 seemed appropriate!)
48+ # TODO: The default error limit here was estimated based on EA traffic. (In an average 10 min
49+ # period, there are roughly 35K events without matching hashes. About 2% of orgs are EA, so for
50+ # simplicity, assume 2% of those events are from EA orgs. If we're willing to tolerate up to a 95%
51+ # failure rate, then we need 35K * 0.02 * 0.95 events to fail to trip the breaker. Technically
52+ # that's 665, not 666, but we're talking about everything going to hell, so the bump to 666 seemed
53+ # appropriate!)
5554#
5655# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current
57- # calculation).
56+ # calculation), and remove this TODO .
5857CIRCUIT_BREAKER_DEFAULT_CONFIG : CircuitBreakerConfig = {
5958 "error_limit" : 666 ,
6059 "error_limit_window" : 600 , # 10 min
6160 "broken_state_duration" : 300 , # 5 min
62- "recovery_error_limit" : 3 , # In recovery, we're twice as strict as normal limit
63- "recovery_error_limit_window" : 60 , # And we bail much more quickly
64- "recovery_duration" : 300 , # 5 min
6561}
6662
6763
@@ -108,66 +104,80 @@ def get_top_dogs(payload):
108104
109105 return format_hof_entries(response)
110106
111- The `breaker.should_allow_request()` check can alternatively be used outside of
112- `get_top_dogs`, to prevent calls to it. In that case, the original `breaker` object can be
113- imported alongside `get_top_dogs` or reinstantiated with the same config - it has no state of
114- its own, instead relying on redis and the cache to track error count and breaker status.
107+ The `breaker.should_allow_request()` check can alternatively be used outside of `get_top_dogs`,
108+ to prevent calls to it. In that case, the original `breaker` object can be imported alongside
109+ `get_top_dogs` or reinstantiated with the same config - it has no state of its own, instead
110+ relying on redis-backed rate limiters and the cache to track error count and breaker status.
115111 """
116112
117- def __init__ (self , key : str , config : CircuitBreakerConfig | None = None ):
113+ def __init__ (self , key : str , config : CircuitBreakerConfig = CIRCUIT_BREAKER_DEFAULT_CONFIG ):
118114 self .key = key
119115 self .broken_state_key = f"{ key } .circuit_breaker.broken"
120116 self .recovery_state_key = f"{ key } .circuit_breaker.in_recovery"
121117
122- final_config : CircuitBreakerConfig = {
123- ** CIRCUIT_BREAKER_DEFAULT_CONFIG ,
124- ** (config or cast (Any , {})),
125- }
126- default_window_granularity = self ._get_default_window_granularity (
127- final_config ["error_limit_window" ]
118+ self .error_limit = config ["error_limit" ]
119+ self .recovery_error_limit = config .get (
120+ "recovery_error_limit" , max (self .error_limit // DEFAULT_RECOVERY_STRICTNESS , 1 )
128121 )
129- default_recovery_window_granularity = self ._get_default_window_granularity (
130- final_config ["recovery_error_limit_window" ]
122+ self .window = config ["error_limit_window" ]
123+ self .window_granularity = config .get (
124+ "error_limit_window_granularity" , max (self .window // 10 , 5 )
131125 )
126+ self .broken_state_duration = config ["broken_state_duration" ]
127+ self .recovery_duration = config .get ("recovery_duration" , self .window * 2 )
132128
133129 self .limiter = RedisSlidingWindowRateLimiter ()
134130 self .primary_quota = Quota (
135- final_config [ "error_limit_window" ] ,
136- final_config . get ( "error_limit_window_granularity" , default_window_granularity ) ,
137- final_config [ " error_limit" ] ,
131+ self . window ,
132+ self . window_granularity ,
133+ self . error_limit ,
138134 f"{ key } .circuit_breaker" ,
139135 )
140136 self .recovery_quota = Quota (
141- final_config ["recovery_error_limit_window" ],
142- final_config .get (
143- "recovery_error_limit_window_granularity" , default_recovery_window_granularity
144- ),
145- final_config ["recovery_error_limit" ],
137+ self .window ,
138+ self .window_granularity ,
139+ self .recovery_error_limit ,
146140 f"{ key } .circuit_breaker_recovery" ,
147141 )
148142
149- self .broken_state_duration = final_config ["broken_state_duration" ]
150- self .recovery_duration = final_config ["recovery_duration" ]
151-
152- if self .recovery_duration < final_config ["recovery_error_limit_window" ]:
143+ if self .recovery_error_limit >= self .error_limit :
144+ self .recovery_error_limit = max (self .error_limit // DEFAULT_RECOVERY_STRICTNESS , 1 )
153145 logger .warning (
154- "Circuit breaker %s has `recovery_duration` < `recovery_error_limit_window`."
155- + " Recovery duration has been reset to match the window." ,
146+ "Circuit breaker '%s' has a recovery error limit (%d) greater than"
147+ + " or equal to its error limit (%d). Using the stricter error-limit-based default"
148+ + " (%d) instead." ,
156149 key ,
150+ config ["recovery_error_limit" ],
151+ self .error_limit ,
152+ self .recovery_error_limit ,
157153 )
158- self .recovery_duration = final_config ["recovery_error_limit_window" ]
159- if self .recovery_duration < final_config ["recovery_error_limit_window" ]:
154+
155+ # XXX: If we discover we have a config where we want this combo to work, we can consider
156+ # using the `MockCircuitBreaker._clear_quota` helper, which is currently only used in tests,
157+ # to clear out the main quota when we switch to the broken state. (It will need tests of its
158+ # own if so.)
159+ if self .broken_state_duration + self .recovery_duration < self .window :
160160 logger .warning (
161- "Circuit breaker %s has `recovery_duration` < `recovery_error_limit_window`."
162- + " Recovery duration has been reset to match the window." ,
161+ "Circuit breaker '%s' has broken state and recovery durations (%d and %d sec)"
162+ + " which together are less than the main error limit window (%d sec). This"
163+ + " can lead to the breaker getting tripped unexpectedly, until the original"
164+ + " spike in errors clears the main time window." ,
163165 key ,
166+ self .broken_state_duration ,
167+ self .recovery_duration ,
168+ self .window ,
164169 )
165- self .recovery_duration = final_config ["recovery_error_limit_window" ]
166170
167171 def record_error (self ) -> None :
172+ """
173+ Record a single error towards the breaker's quota, and handle the case where that error puts
174+ us over the limit.
175+ """
168176 state , seconds_left_in_state = self ._get_state_and_remaining_time ()
169177
170178 if state == CircuitBreakerState .BROKEN :
179+ assert seconds_left_in_state is not None # mypy appeasement
180+
171181 # If the circuit is broken, and `should_allow_request` is being used correctly, requests
172182 # should be blocked and we shouldn't even be here. That said, maybe there was a race
173183 # condition, so make sure the circuit hasn't just broken before crying foul.
@@ -222,6 +232,10 @@ def record_error(self) -> None:
222232 cache .set (self .recovery_state_key , recovery_state_expiry , recovery_state_timeout )
223233
224234 def should_allow_request (self ) -> bool :
235+ """
236+ Determine, based on the current state of the breaker and the number of allowable errors
237+ remaining, whether requests should be allowed through.
238+ """
225239 state , _ = self ._get_state_and_remaining_time ()
226240
227241 if state == CircuitBreakerState .BROKEN :
@@ -255,8 +269,13 @@ def get_remaining_error_quota(
255269 self , quota : Quota | None = None , window_end : int | None = None
256270 ) -> int | None :
257271 """
258- # TODO: write me
259- returns None when in broken state
272+ Get the number of allowable errors remaining in the given quota for the time window ending
273+ at the given time.
274+
275+ If no quota is given, in closed and recovery states, return the current controlling quota's
276+ remaining errors. In broken state, return None.
277+
278+ If no time window end is given, return the current amount of quota remaining.
260279 """
261280 if not quota :
262281 quota = self ._get_controlling_quota ()
@@ -273,12 +292,6 @@ def get_remaining_error_quota(
273292
274293 return result [0 ].granted
275294
276- def _get_default_window_granularity (self , window_duration : int ) -> int :
277- # Never more than 10 buckets, and no bucket smaller than 5 seconds. If greater precision is
278- # needed, the `error_limit_window_granularity` and `recovery_error_limit_window_granularity`
279- # config options can be used.
280- return max (ceil (window_duration / 10 ), 5 )
281-
282295 @overload
283296 def _get_controlling_quota (
284297 self , state : Literal [CircuitBreakerState .CLOSED , CircuitBreakerState .RECOVERY ]
@@ -295,8 +308,8 @@ def _get_controlling_quota(self) -> Quota | None:
295308
296309 def _get_controlling_quota (self , state : CircuitBreakerState | None = None ) -> Quota | None :
297310 """
298- # TODO: write me
299- returns None when in broken state
311+ Return the Quota corresponding to the given breaker state (or the current breaker state, if
312+ no state is provided). If the state is question is the broken state, return None.
300313 """
301314 controlling_quota_by_state = {
302315 CircuitBreakerState .CLOSED : self .primary_quota ,
@@ -308,10 +321,13 @@ def _get_controlling_quota(self, state: CircuitBreakerState | None = None) -> Qu
308321
309322 return controlling_quota_by_state [_state ]
310323
311- def _get_state_and_remaining_time (self ) -> tuple [CircuitBreakerState , int ]:
324+ def _get_state_and_remaining_time (
325+ self ,
326+ ) -> tuple [CircuitBreakerState , int | None ]:
312327 """
313328 Return the current state of the breaker (closed, broken, or in recovery), along with the
314- number of seconds until that state expires.
329+ number of seconds until that state expires (or `None` when in closed state, as it has no
330+ expiry).
315331 """
316332 now = int (time .time ())
317333
@@ -322,7 +338,4 @@ def _get_state_and_remaining_time(self) -> tuple[CircuitBreakerState, int]:
322338 if cache .has_key (self .recovery_state_key ):
323339 return (CircuitBreakerState .RECOVERY , cache .get (self .recovery_state_key ) - now )
324340
325- # TODO Fix this with overloads?
326- # 0 here is just a placeholder, as "remaining seconds" doesn't really apply to a state we
327- # hope to stay in indefinitely
328- return (CircuitBreakerState .CLOSED , 0 )
341+ return (CircuitBreakerState .CLOSED , None )
0 commit comments