-
Notifications
You must be signed in to change notification settings - Fork 4.5k
xds: client outlier detection metrics [A91] #8437
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
394b6a5
7cd46ea
dc641b1
8956133
48df6e4
46d27ff
19021a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
|
||
"google.golang.org/grpc/balancer" | ||
"google.golang.org/grpc/connectivity" | ||
estats "google.golang.org/grpc/experimental/stats" | ||
"google.golang.org/grpc/internal/balancer/gracefulswitch" | ||
"google.golang.org/grpc/internal/buffer" | ||
"google.golang.org/grpc/internal/channelz" | ||
|
@@ -52,6 +53,26 @@ | |
// Name is the name of the outlier detection balancer. | ||
const Name = "outlier_detection_experimental" | ||
|
||
var ( | ||
ejectionsEnforcedMetric = estats.RegisterInt64Count(estats.MetricDescriptor{ | ||
Name: "grpc.lb.outlier_detection.ejections_enforced", | ||
Description: "EXPERIMENTAL. Number of outlier ejections enforced by detection method", | ||
Unit: "ejection", | ||
Labels: []string{"grpc.target", "grpc.lb.outlier_detection.detection_method"}, | ||
OptionalLabels: []string{"grpc.lb.backend_service"}, | ||
Default: false, | ||
}) | ||
|
||
ejectionsUnenforcedMetric = estats.RegisterInt64Count(estats.MetricDescriptor{ | ||
Name: "grpc.lb.outlier_detection.ejections_unenforced", | ||
Description: "EXPERIMENTAL. Number of unenforced outlier ejections due to either max ejection percentage or enforcement_percentage", | ||
Unit: "ejection", | ||
Labels: []string{"grpc.target", "grpc.lb.outlier_detection.detection_method", "grpc.lb.outlier_detection.unenforced_reason"}, | ||
OptionalLabels: []string{"grpc.lb.backend_service"}, | ||
Default: false, | ||
}) | ||
) | ||
|
||
func init() { | ||
balancer.Register(bb{}) | ||
} | ||
|
@@ -60,14 +81,16 @@ | |
|
||
func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer { | ||
b := &outlierDetectionBalancer{ | ||
ClientConn: cc, | ||
closed: grpcsync.NewEvent(), | ||
done: grpcsync.NewEvent(), | ||
addrs: make(map[string]*endpointInfo), | ||
scUpdateCh: buffer.NewUnbounded(), | ||
pickerUpdateCh: buffer.NewUnbounded(), | ||
channelzParent: bOpts.ChannelzParent, | ||
endpoints: resolver.NewEndpointMap[*endpointInfo](), | ||
ClientConn: cc, | ||
closed: grpcsync.NewEvent(), | ||
done: grpcsync.NewEvent(), | ||
addrs: make(map[string]*endpointInfo), | ||
scUpdateCh: buffer.NewUnbounded(), | ||
pickerUpdateCh: buffer.NewUnbounded(), | ||
channelzParent: bOpts.ChannelzParent, | ||
endpoints: resolver.NewEndpointMap[*endpointInfo](), | ||
metricsRecorder: cc.MetricsRecorder(), | ||
target: bOpts.Target.String(), | ||
} | ||
b.logger = prefixLogger(b) | ||
b.logger.Infof("Created") | ||
|
@@ -134,6 +157,15 @@ | |
return Name | ||
} | ||
|
||
// extractBackendService extracts the backend service from resolver state attributes. | ||
// This is a placeholder implementation - the actual extraction logic should be | ||
// implemented based on the specific resolver attributes available. | ||
func extractBackendService(resolver.State) string { | ||
// TODO: Implement backend service extraction from resolver attributes per A89 and A75 | ||
// For now, return empty string as this is optional | ||
return "" | ||
} | ||
|
||
Comment on lines
+160
to
+168
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please remove this? If it's not clear this needs to be done as part of A75/A89 implementation work, then you could file a bug so that we don't forget about it. Thanks! |
||
// scUpdate wraps a subConn update to be sent to the child balancer. | ||
type scUpdate struct { | ||
scw *subConnWrapper | ||
|
@@ -169,10 +201,13 @@ | |
// to suppress redundant picker updates. | ||
recentPickerNoop bool | ||
|
||
closed *grpcsync.Event | ||
done *grpcsync.Event | ||
logger *grpclog.PrefixLogger | ||
channelzParent channelz.Identifier | ||
closed *grpcsync.Event | ||
done *grpcsync.Event | ||
logger *grpclog.PrefixLogger | ||
channelzParent channelz.Identifier | ||
metricsRecorder estats.MetricsRecorder | ||
target string | ||
backendService string | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can also be removed along with the above. |
||
|
||
child synchronizingBalancerWrapper | ||
|
||
|
@@ -294,6 +329,7 @@ | |
b.inhibitPickerUpdates = true | ||
b.updateUnconditionally = false | ||
b.cfg = lbCfg | ||
b.backendService = extractBackendService(s.ResolverState) | ||
|
||
newEndpoints := resolver.NewEndpointMap[bool]() | ||
for _, ep := range s.ResolverState.Endpoints { | ||
|
@@ -791,15 +827,21 @@ | |
for _, epInfo := range endpointsToConsider { | ||
bucket := epInfo.callCounter.inactiveBucket | ||
ejectionCfg := b.cfg.SuccessRateEjection | ||
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) { | ||
return | ||
} | ||
successRate := float64(bucket.numSuccesses) / float64(bucket.numSuccesses+bucket.numFailures) | ||
requiredSuccessRate := mean - stddev*(float64(ejectionCfg.StdevFactor)/1000) | ||
if successRate < requiredSuccessRate { | ||
channelz.Infof(logger, b.channelzParent, "SuccessRate algorithm detected outlier: %s. Parameters: successRate=%f, mean=%f, stddev=%f, requiredSuccessRate=%f", epInfo, successRate, mean, stddev, requiredSuccessRate) | ||
// Check if max ejection percentage would prevent ejection | ||
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) { | ||
// Record unenforced ejection due to max ejection percentage | ||
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "success_rate", "max_ejection_overflow", b.backendService) | ||
continue | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add tests that include the cases that are currently missed? I.e. ejection did not occur for various reasons. |
||
} | ||
if uint32(rand.Int32N(100)) < ejectionCfg.EnforcementPercentage { | ||
b.ejectEndpoint(epInfo) | ||
b.ejectEndpoint(epInfo, "success_rate") | ||
} else { | ||
// Record unenforced ejection due to enforcement percentage | ||
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "success_rate", "enforcement_percentage", b.backendService) | ||
} | ||
} | ||
} | ||
|
@@ -819,21 +861,27 @@ | |
for _, epInfo := range endpointsToConsider { | ||
bucket := epInfo.callCounter.inactiveBucket | ||
ejectionCfg := b.cfg.FailurePercentageEjection | ||
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) { | ||
return | ||
} | ||
failurePercentage := (float64(bucket.numFailures) / float64(bucket.numSuccesses+bucket.numFailures)) * 100 | ||
if failurePercentage > float64(b.cfg.FailurePercentageEjection.Threshold) { | ||
channelz.Infof(logger, b.channelzParent, "FailurePercentage algorithm detected outlier: %s, failurePercentage=%f", epInfo, failurePercentage) | ||
// Check if max ejection percentage would prevent ejection | ||
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) { | ||
// Record unenforced ejection due to max ejection percentage | ||
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "failure_percentage", "max_ejection_overflow", b.backendService) | ||
continue | ||
} | ||
if uint32(rand.Int32N(100)) < ejectionCfg.EnforcementPercentage { | ||
b.ejectEndpoint(epInfo) | ||
b.ejectEndpoint(epInfo, "failure_percentage") | ||
} else { | ||
// Record unenforced ejection due to enforcement percentage | ||
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "failure_percentage", "enforcement_percentage", b.backendService) | ||
} | ||
} | ||
} | ||
} | ||
|
||
// Caller must hold b.mu. | ||
func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo) { | ||
func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo, detectionMethod string) { | ||
b.numEndpointsEjected++ | ||
epInfo.latestEjectionTimestamp = b.timerStartTime | ||
epInfo.ejectionTimeMultiplier++ | ||
|
@@ -842,6 +890,8 @@ | |
channelz.Infof(logger, b.channelzParent, "Subchannel ejected: %s", sbw) | ||
} | ||
|
||
// Record the enforced ejection metric | ||
ejectionsEnforcedMetric.Record(b.metricsRecorder, 1, b.target, detectionMethod, b.backendService) | ||
} | ||
|
||
// Caller must hold b.mu. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Optional: since we have
cc
captured, maybe we don't also want to save theMetricsRecorder
in another field? Less state seems better.