Skip to content

Commit cfb06b5

Browse files
committed
add CircuitBreaker class
1 parent a024f03 commit cfb06b5

File tree

1 file changed

+141
-0
lines changed

1 file changed

+141
-0
lines changed

src/sentry/utils/circuit_breaker2.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,19 @@
99
from enum import Enum
1010
from typing import NotRequired, TypedDict
1111

12+
from django.conf import settings
13+
14+
from sentry.ratelimits.sliding_windows import Quota, RedisSlidingWindowRateLimiter
15+
1216
logger = logging.getLogger(__name__)
1317

18+
# XXX: Keep the `CircuitBreakerConfig` docstrings in sync with these two values —
# they must be updated whenever either value changes.

# Factor by which the error limit is tightened while the breaker is in recovery
DEFAULT_RECOVERY_STRICTNESS = 10

# Factor by which the error window is lengthened to produce the recovery window
DEFAULT_RECOVERY_WINDOW_MULTIPLIER = 2
24+
1425

1526
class CircuitBreakerState(Enum):
1627
OK = "circuit_okay"
@@ -37,3 +48,133 @@ class CircuitBreakerConfig(TypedDict):
3748
# error limit) before returning to normal operation. Will be set to twice `error_limit_window`
3849
# if not provided.
3950
recovery_duration: NotRequired[int]
51+
52+
53+
class CircuitBreaker:
    """
    A circuit breaker to be used to temporarily block requests to or calls of a service or function
    which is throwing too many errors.

    The breaker has three states: circuit OK, circuit BROKEN, and circuit in RECOVERY. (These states
    respectively correspond to the closed, open, and half-open states of the traditional circuit
    breaker model, but are hopefully easier to keep straight than a model where closed is good and
    open is bad.)

    In an OK state (normal operation), the breaker tracks errors but allows through all requests.
    If the frequency of errors passes a given threshold, it moves to BROKEN state.

    In a BROKEN state, all requests are blocked. Once a set amount of time has passed, it moves
    to RECOVERY state.

    RECOVERY state is identical to OK state, except that the threshold for the circuit breaking
    (moving back into BROKEN state) is much stricter. Once a set amount of time has passed
    without the breaker being tripped, it moves back to OK state.

    The overall idea is to stop hitting a service which seems to be failing, but periodically make
    short attempts to use it in order to be able to resume requests once it comes back up.

    Usage:

        # See `CircuitBreakerConfig` class for config options
        breaker = CircuitBreaker("squirrel_chasing", config)

        def get_top_dogs(payload):
            # Check the state of the breaker before calling the service
            try:
                if breaker.should_allow_request():
                    response = call_chase_simulation_service("/hall-of-fame", payload)
                else:
                    logger.warning("Request blocked by circuit breaker!")
                    return None

            # Call `record_error` only in `except` blocks whose errors should count towards the quota
            except TimeoutError:
                breaker.record_error()
                return "timeout"  # or reraise
            except BadInputError:
                return "bad input"
            except Exception:
                breaker.record_error()
                return "unknown error"

            # Call `record_error` for other problems which should count as errors
            if response.status == 500:
                breaker.record_error()
                return f"got {response.status}"

            return format_hof_entries(response)

    The `breaker.should_allow_request()` check can alternatively be used outside of `get_top_dogs`,
    to prevent calls to it. In that case, the original `breaker` object can be imported alongside
    `get_top_dogs` or reinstantiated with the same config - it has no state of its own, instead
    relying on redis-backed rate limiters and redis itself to track error count and breaker status.
    """

    def __init__(self, key: str, config: CircuitBreakerConfig):
        self.key = key
        self.broken_state_key = f"{key}.circuit_breaker.broken"
        self.recovery_state_key = f"{key}.circuit_breaker.in_recovery"

        self.error_limit = config["error_limit"]
        # During recovery the limit is DEFAULT_RECOVERY_STRICTNESS times stricter, floored at 1
        default_recovery_error_limit = max(self.error_limit // DEFAULT_RECOVERY_STRICTNESS, 1)
        self.recovery_error_limit = config.get("recovery_error_limit", default_recovery_error_limit)

        self.window = config["error_limit_window"]
        # Default granularity: 1/20th of the window, but never finer than 5 seconds
        self.window_granularity = config.get(
            "error_limit_window_granularity", max(self.window // 20, 5)
        )

        self.broken_state_duration = config["broken_state_duration"]
        self.recovery_duration = config.get(
            "recovery_duration", self.window * DEFAULT_RECOVERY_WINDOW_MULTIPLIER
        )

        # Error counts and breaker state live in redis, not on this object, so separately
        # constructed breakers with the same key and config are interchangeable.
        self.limiter = RedisSlidingWindowRateLimiter()
        self.redis_pipeline = self.limiter.client.pipeline()

        # Quota governing the OK state
        self.primary_quota = Quota(
            self.window,
            self.window_granularity,
            self.error_limit,
            f"{key}.circuit_breaker.ok",
        )
        # Stricter quota governing the RECOVERY state
        self.recovery_quota = Quota(
            self.window,
            self.window_granularity,
            self.recovery_error_limit,
            f"{key}.circuit_breaker.recovery",
        )

        # In dev, throw an error on bad config so it can be fixed permanently. In prod, just warn
        # and fix it ourselves.
        log = logger.error if settings.DEBUG else logger.warning

        if self.recovery_error_limit >= self.error_limit:
            log(
                "Circuit breaker '%s' has a recovery error limit (%d) greater than or equal"
                " to its primary error limit (%d). Using the stricter error-limit-based"
                " default (%d) instead.",
                key,
                self.recovery_error_limit,
                self.error_limit,
                default_recovery_error_limit,
            )
            self.recovery_error_limit = default_recovery_error_limit

        # XXX: If we discover we have a config where we want this combo to work, we can consider
        # using the `MockCircuitBreaker._clear_quota` helper, which is currently only used in tests,
        # to clear out the main quota when we switch to the BROKEN state. (It will need tests of its
        # own if so.)
        if self.broken_state_duration + self.recovery_duration < self.window:
            # Use `log` (not a hard-coded `logger.warning`) for consistency with the config
            # check above: bad config should be loud in dev.
            log(
                "Circuit breaker '%s' has BROKEN and RECOVERY state durations (%d and %d sec, respectively)"
                " which together are less than the main error limit window (%d sec). This can lead to the"
                " breaker getting tripped unexpectedly, until the original spike in errors clears the"
                " main time window. Extending RECOVERY period to %d seconds, to give the primary quota time"
                " to clear.",
                key,
                self.broken_state_duration,
                self.recovery_duration,
                self.window,
                self.window - self.broken_state_duration,
            )
            # BUG FIX: the message above promises to extend the recovery period (and logs the
            # extended value), but the original code never actually assigned it. Apply the
            # extension so broken + recovery together cover the full error window.
            self.recovery_duration = self.window - self.broken_state_duration

0 commit comments

Comments
 (0)