@@ -149,16 +149,30 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
149149 return func (ctx context.Context , qs string , t time.Time ) (promql.Vector , error ) {
150150 queries .Inc ()
151151 result , err := qf (ctx , qs , t )
152- // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
153- // It returns promql.ErrStorage, if error should be reported back as 500.
154- // Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures),
155- // or various user-errors (limits, duplicate samples, etc. ... also not failures).
156- //
157- // All errors will still be counted towards "evaluation failures" metrics and logged by Prometheus Ruler,
158- // but we only want internal errors here.
159- if _ , ok := querier .TranslateToPromqlAPIError (err ).(promql.ErrStorage ); ok {
160- failedQueries .Inc ()
152+
153+ // We only care about errors returned by underlying Queryable. Errors returned by PromQL engine are "user-errors",
154+ // and not interesting here.
155+ qerr := QueryableError {}
156+ if err != nil && errors .As (err , & qerr ) {
157+ origErr := qerr .Unwrap ()
158+
159+ // Not all errors returned by Queryable are interesting, only those that would result in 500 status code.
160+ //
161+ // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
162+ // It returns promql.ErrStorage, if error should be reported back as 500.
163+ // Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures),
164+ // or various user-errors (limits, duplicate samples, etc. ... also not failures).
165+ //
166+ // All errors will still be counted towards "evaluation failures" metrics and logged by Prometheus Ruler,
167+ // but we only want internal errors here.
168+ if _ , ok := querier .TranslateToPromqlAPIError (origErr ).(promql.ErrStorage ); ok {
169+ failedQueries .Inc ()
170+ }
171+
172+ // Return unwrapped error.
173+ return result , origErr
161174 }
175+
162176 return result , err
163177 }
164178}
@@ -234,10 +248,10 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
234248 }, []string {"user" })
235249 }
236250
237- // Make sure that queryable reports errors as expected by our MetricsQueryFunc.
238- // Note that if queryable is already wrapped into ErrorTranslateQueryable, wrapping it again
239- // is fine (see TestApiStatusCodesDoubleWrapper in querier package) .
240- q = querier .NewErrorTranslateQueryable ( q )
251+ // Wrap errors returned by Queryable to our wrapper, so that we can distinguish between those errors
252+ // and errors returned by PromQL engine. Errors from Queryable can be either caused by user (limits) or internal errors.
253+ // Errors from PromQL are always "user" errors .
254+ q = querier .NewErrorTranslateQueryableWithFn ( q , WrapQueryableErrors )
241255
242256 return func (ctx context.Context , userID string , notifier * notifier.Manager , logger log.Logger , reg prometheus.Registerer ) RulesManager {
243257 var queryTime prometheus.Counter = nil
@@ -260,3 +274,23 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
260274 })
261275 }
262276}
277+
278+ type QueryableError struct {
279+ err error
280+ }
281+
282+ func (q QueryableError ) Unwrap () error {
283+ return q .err
284+ }
285+
286+ func (q QueryableError ) Error () string {
287+ return q .err .Error ()
288+ }
289+
290+ func WrapQueryableErrors (err error ) error {
291+ if err == nil {
292+ return err
293+ }
294+
295+ return QueryableError {err : err }
296+ }
0 commit comments