@@ -523,6 +523,166 @@ func TestRulerAlertmanagerTLS(t *testing.T) {
523523 require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_notifications_alertmanagers_discovered" }, e2e .WaitMissingMetrics ))
524524}
525525
526+ func TestRulerMetricsForInvalidQueries (t * testing.T ) {
527+ s , err := e2e .NewScenario (networkName )
528+ require .NoError (t , err )
529+ defer s .Close ()
530+
531+ // Start dependencies.
532+ consul := e2edb .NewConsul ()
533+ minio := e2edb .NewMinio (9000 , bucketName , rulestoreBucketName )
534+ require .NoError (t , s .StartAndWaitReady (consul , minio ))
535+
536+ // Configure the ruler.
537+ flags := mergeFlags (
538+ BlocksStorageFlags (),
539+ RulerFlags (false ),
540+ map [string ]string {
541+ // Since we're not going to run any rule (our only rule is invalid), we don't need the
542+ // store-gateway to be configured to a valid address.
543+ "-querier.store-gateway-addresses" : "localhost:12345" ,
544+ // Enable the bucket index so we can skip the initial bucket scan.
545+ "-blocks-storage.bucket-store.bucket-index.enabled" : "true" ,
546+ // Evaluate rules often, so that we don't need to wait for metrics to show up.
547+ "-ruler.evaluation-interval" : "2s" ,
548+ "-ruler.poll-interval" : "2s" ,
549+ // No delay
550+ "-ruler.evaluation-delay-duration" : "0" ,
551+
552+ "-blocks-storage.tsdb.block-ranges-period" : "1h" ,
553+ "-blocks-storage.bucket-store.sync-interval" : "1s" ,
554+ "-blocks-storage.tsdb.retention-period" : "2h" ,
555+
556+ // We run single ingester only, no replication.
557+ "-distributor.replication-factor" : "1" ,
558+
559+ // Very low limit so that ruler hits it.
560+ "-querier.max-fetched-chunks-per-query" : "5" ,
561+ // We need this to make limit work.
562+ "-ingester.stream-chunks-when-using-blocks" : "true" ,
563+ },
564+ )
565+
566+ const namespace = "test"
567+ const user = "user"
568+
569+ distributor := e2ecortex .NewDistributor ("distributor" , consul .NetworkHTTPEndpoint (), flags , "" )
570+ ruler := e2ecortex .NewRuler ("ruler" , consul .NetworkHTTPEndpoint (), flags , "" )
571+ ingester := e2ecortex .NewIngester ("ingester" , consul .NetworkHTTPEndpoint (), flags , "" )
572+ require .NoError (t , s .StartAndWaitReady (distributor , ingester , ruler ))
573+
574+ // Wait until both the distributor and ruler have updated the ring. The querier will also watch
575+ // the store-gateway ring if blocks sharding is enabled.
576+ require .NoError (t , distributor .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
577+ require .NoError (t , ruler .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
578+
579+ c , err := e2ecortex .NewClient (distributor .HTTPEndpoint (), "" , "" , ruler .HTTPEndpoint (), user )
580+ require .NoError (t , err )
581+
582+ // Push some series to Cortex -- enough so that we can hit some limits.
583+ for i := 0 ; i < 10 ; i ++ {
584+ series , _ := generateSeries ("metric" , time .Now (), prompb.Label {Name : "foo" , Value : fmt .Sprintf ("%d" , i )})
585+
586+ res , err := c .Push (series )
587+ require .NoError (t , err )
588+ require .Equal (t , 200 , res .StatusCode )
589+ }
590+
591+ totalQueries , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_total" })
592+ require .NoError (t , err )
593+
594+ // Verify that user-failures don't increase cortex_ruler_queries_failed_total
595+ for groupName , expression := range map [string ]string {
596+ // Syntactically correct expression (passes check in ruler), but failing because of invalid regex. This fails in PromQL engine.
597+ "invalid_group" : `label_replace(metric, "foo", "$1", "service", "[")` ,
598+
599+ // This one fails in querier code, because of limits.
600+ "too_many_chunks_group" : `sum(metric)` ,
601+ } {
602+ t .Run (groupName , func (t * testing.T ) {
603+ require .NoError (t , c .SetRuleGroup (ruleGroupWithRule (groupName , "rule" , expression ), namespace ))
604+ m := ruleGroupMatcher (user , namespace , groupName )
605+
606+ // Wait until ruler has loaded the group.
607+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
608+
609+ // Wait until rule group has tried to evaluate the rule.
610+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
611+
612+ // Verify that evaluation of the rule failed.
613+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluation_failures_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
614+
615+ // But these failures were not reported as "failed queries"
616+ sum , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_failed_total" })
617+ require .NoError (t , err )
618+ require .Equal (t , float64 (0 ), sum [0 ])
619+
620+ // Delete rule before checkin "cortex_ruler_queries_total", as we want to reuse value for next test.
621+ require .NoError (t , c .DeleteRuleGroup (namespace , groupName ))
622+
623+ // Wait until ruler has unloaded the group. We don't use any matcher, so there should be no groups (in fact, metric disappears).
624+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (0 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .SkipMissingMetrics ))
625+
626+ // Check that cortex_ruler_queries_total went up since last test.
627+ newTotalQueries , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_total" })
628+ require .NoError (t , err )
629+ require .Greater (t , newTotalQueries [0 ], totalQueries [0 ])
630+
631+ // Remember totalQueries for next test.
632+ totalQueries = newTotalQueries
633+ })
634+ }
635+
636+ // Now let's upload a non-failing rule, and make sure that it works.
637+ t .Run ("real_error" , func (t * testing.T ) {
638+ const groupName = "good_rule"
639+ const expression = `sum(metric{foo=~"1|2"})`
640+
641+ require .NoError (t , c .SetRuleGroup (ruleGroupWithRule (groupName , "rule" , expression ), namespace ))
642+ m := ruleGroupMatcher (user , namespace , groupName )
643+
644+ // Wait until ruler has loaded the group.
645+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
646+
647+ // Wait until rule group has tried to evaluate the rule, and succeeded.
648+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
649+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (0 ), []string {"cortex_prometheus_rule_evaluation_failures_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
650+
651+ // Still no failures.
652+ sum , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_failed_total" })
653+ require .NoError (t , err )
654+ require .Equal (t , float64 (0 ), sum [0 ])
655+
656+ // Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
657+ require .NoError (t , s .Stop (ingester ))
658+
659+ // We should start getting "real" failures now.
660+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_ruler_queries_failed_total" }))
661+ })
662+ }
663+
664+ func ruleGroupMatcher (user , namespace , groupName string ) * labels.Matcher {
665+ return labels .MustNewMatcher (labels .MatchEqual , "rule_group" , fmt .Sprintf ("/rules/%s/%s;%s" , user , namespace , groupName ))
666+ }
667+
668+ func ruleGroupWithRule (groupName string , ruleName string , expression string ) rulefmt.RuleGroup {
669+ // Prepare rule group with invalid rule.
670+ var recordNode = yaml.Node {}
671+ var exprNode = yaml.Node {}
672+
673+ recordNode .SetString (ruleName )
674+ exprNode .SetString (expression )
675+
676+ return rulefmt.RuleGroup {
677+ Name : groupName ,
678+ Interval : 10 ,
679+ Rules : []rulefmt.RuleNode {{
680+ Record : recordNode ,
681+ Expr : exprNode ,
682+ }},
683+ }
684+ }
685+
526686func createTestRuleGroup (t * testing.T ) rulefmt.RuleGroup {
527687 t .Helper ()
528688
0 commit comments