@@ -528,30 +528,11 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
528528 require .NoError (t , err )
529529 defer s .Close ()
530530
531- // Prepare rule group with invalid rule.
532- var recordNode = yaml.Node {}
533- var exprNode = yaml.Node {}
534-
535- recordNode .SetString ("invalid_rule" )
536- exprNode .SetString (`label_replace(metric, "foo", "$1", "service", "[")` ) // Syntactically correct expression (passes check in ruler), but it fails because of invalid regex.
537-
538- ruleGroup := rulefmt.RuleGroup {
539- Name : "group_with_invalid_rule" ,
540- Interval : 10 ,
541- Rules : []rulefmt.RuleNode {{
542- Record : recordNode ,
543- Expr : exprNode ,
544- }},
545- }
546-
547531 // Start dependencies.
548532 consul := e2edb .NewConsul ()
549533 minio := e2edb .NewMinio (9000 , bucketName , rulestoreBucketName )
550534 require .NoError (t , s .StartAndWaitReady (consul , minio ))
551535
552- // Start Cortex components.
553- require .NoError (t , copyFileToSharedDir (s , "docs/configuration/single-process-config-blocks.yaml" , cortexConfigFile ))
554-
555536 // Configure the ruler.
556537 flags := mergeFlags (
557538 BlocksStorageFlags (),
@@ -564,37 +545,142 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
564545 "-blocks-storage.bucket-store.bucket-index.enabled" : "true" ,
565546 // Evaluate rules often, so that we don't need to wait for metrics to show up.
566547 "-ruler.evaluation-interval" : "2s" ,
548+ "-ruler.poll-interval" : "2s" ,
549+ // No delay
550+ "-ruler.evaluation-delay-duration" : "0" ,
551+
552+ "-blocks-storage.tsdb.block-ranges-period" : "1h" ,
553+ "-blocks-storage.bucket-store.sync-interval" : "1s" ,
554+ "-blocks-storage.tsdb.retention-period" : "2h" ,
555+
556+ // We run single ingester only, no replication.
557+ "-distributor.replication-factor" : "1" ,
558+
559+ // Very low limit so that ruler hits it.
560+ "-querier.max-fetched-chunks-per-query" : "5" ,
561+ // We need this to make limit work.
562+ "-ingester.stream-chunks-when-using-blocks" : "true" ,
567563 },
568564 )
569565
570- // Start ruler.
571- cortex := e2ecortex .NewSingleBinaryWithConfigFile ("cortex" , cortexConfigFile , flags , "" , 9009 , 9095 )
572- require .NoError (t , s .StartAndWaitReady (cortex ))
566+ const namespace = "test"
567+ const user = "user"
573568
574- c , err := e2ecortex .NewClient (cortex .HTTPEndpoint (), cortex .HTTPEndpoint (), "" , cortex .HTTPEndpoint (), "user" )
575- require .NoError (t , err )
569+ distributor := e2ecortex .NewDistributor ("distributor" , consul .NetworkHTTPEndpoint (), flags , "" )
570+ ruler := e2ecortex .NewRuler ("ruler" , consul .NetworkHTTPEndpoint (), flags , "" )
571+ ingester := e2ecortex .NewIngester ("ingester" , consul .NetworkHTTPEndpoint (), flags , "" )
572+ require .NoError (t , s .StartAndWaitReady (distributor , ingester , ruler ))
576573
577- // Push some series to Cortex.
578- series , _ := generateSeries ("metric" , time .Now (), prompb.Label {Name : "foo" , Value : "bar" })
574+ // Wait until both the distributor and ruler have updated the ring. The querier will also watch
575+ // the store-gateway ring if blocks sharding is enabled.
576+ require .NoError (t , distributor .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
577+ require .NoError (t , ruler .WaitSumMetrics (e2e .Equals (512 ), "cortex_ring_tokens_total" ))
579578
580- res , err := c . Push ( series )
579+ c , err := e2ecortex . NewClient ( distributor . HTTPEndpoint (), "" , "" , ruler . HTTPEndpoint (), user )
581580 require .NoError (t , err )
582- require .Equal (t , 200 , res .StatusCode )
583581
584- // Upload rule group to the ruler.
582+ // Push some series to Cortex -- enough so that we can hit some limits.
583+ for i := 0 ; i < 10 ; i ++ {
584+ series , _ := generateSeries ("metric" , time .Now (), prompb.Label {Name : "foo" , Value : fmt .Sprintf ("%d" , i )})
585+
586+ res , err := c .Push (series )
587+ require .NoError (t , err )
588+ require .Equal (t , 200 , res .StatusCode )
589+ }
590+
591+ totalQueries , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_total" })
585592 require .NoError (t , err )
586- require .NoError (t , c .SetRuleGroup (ruleGroup , "test" ))
587593
588- // Wait until ruler has loaded the group.
589- require .NoError (t , cortex .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WaitMissingMetrics ))
594+ // Verify that user-failures don't increase cortex_ruler_queries_failed_total
595+ for groupName , expression := range map [string ]string {
596+ // Syntactically correct expression (passes check in ruler), but failing because of invalid regex. This fails in PromQL engine.
597+ "invalid_group" : `label_replace(metric, "foo", "$1", "service", "[")` ,
598+
599+ // This one fails in querier code, because of limits.
600+ "too_many_chunks_group" : `sum(metric)` ,
601+ } {
602+ t .Run (groupName , func (t * testing.T ) {
603+ require .NoError (t , c .SetRuleGroup (ruleGroupWithRule (groupName , "rule" , expression ), namespace ))
604+ m := ruleGroupMatcher (user , namespace , groupName )
605+
606+ // Wait until ruler has loaded the group.
607+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
608+
609+ // Wait until rule group has tried to evaluate the rule.
610+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
611+
612+ // Verify that evaluation of the rule failed.
613+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluation_failures_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
614+
615+ // But these failures were not reported as "failed queries"
616+ sum , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_failed_total" })
617+ require .NoError (t , err )
618+ require .Equal (t , float64 (0 ), sum [0 ])
590619
591- // Wait until rule group has tried to evaluate few times.
592- require .NoError (t , cortex .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WaitMissingMetrics ))
593- require .NoError (t , cortex .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_ruler_queries_total" }, e2e .WaitMissingMetrics ))
620+ // Delete rule before checking "cortex_ruler_queries_total", as we want to reuse value for next test.
621+ require .NoError (t , c .DeleteRuleGroup (namespace , groupName ))
594622
595- // We want to verify that ruler doesn't report evaluation failure as "queries failed", but only as "evaluation failures".
596- require .NoError (t , cortex .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluation_failures_total" }, e2e .WaitMissingMetrics ))
597- require .NoError (t , cortex .WaitSumMetrics (e2e .Equals (0 ), "cortex_ruler_queries_failed_total" ))
623+ // Wait until ruler has unloaded the group. We don't use any matcher, so there should be no groups (in fact, metric disappears).
624+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (0 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .SkipMissingMetrics ))
625+
626+ // Check that cortex_ruler_queries_total went up since last test.
627+ newTotalQueries , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_total" })
628+ require .NoError (t , err )
629+ require .Greater (t , newTotalQueries [0 ], totalQueries [0 ])
630+
631+ // Remember totalQueries for next test.
632+ totalQueries = newTotalQueries
633+ })
634+ }
635+
636+ // Now let's upload a non-failing rule, and make sure that it works.
637+ t .Run ("real_error" , func (t * testing.T ) {
638+ const groupName = "good_rule"
639+ const expression = `sum(metric{foo=~"1|2"})`
640+
641+ require .NoError (t , c .SetRuleGroup (ruleGroupWithRule (groupName , "rule" , expression ), namespace ))
642+ m := ruleGroupMatcher (user , namespace , groupName )
643+
644+ // Wait until ruler has loaded the group.
645+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (1 ), []string {"cortex_prometheus_rule_group_rules" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
646+
647+ // Wait until rule group has tried to evaluate the rule, and succeeded.
648+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_prometheus_rule_evaluations_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
649+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .Equals (0 ), []string {"cortex_prometheus_rule_evaluation_failures_total" }, e2e .WithLabelMatchers (m ), e2e .WaitMissingMetrics ))
650+
651+ // Still no failures.
652+ sum , err := ruler .SumMetrics ([]string {"cortex_ruler_queries_failed_total" })
653+ require .NoError (t , err )
654+ require .Equal (t , float64 (0 ), sum [0 ])
655+
656+ // Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
657+ require .NoError (t , s .Stop (ingester ))
658+
659+ // We should start getting "real" failures now.
660+ require .NoError (t , ruler .WaitSumMetricsWithOptions (e2e .GreaterOrEqual (1 ), []string {"cortex_ruler_queries_failed_total" }))
661+ })
662+ }
663+
664+ func ruleGroupMatcher (user , namespace , groupName string ) * labels.Matcher {
665+ return labels .MustNewMatcher (labels .MatchEqual , "rule_group" , fmt .Sprintf ("/rules/%s/%s;%s" , user , namespace , groupName ))
666+ }
667+
668+ func ruleGroupWithRule (groupName string , ruleName string , expression string ) rulefmt.RuleGroup {
669+ // Prepare rule group with invalid rule.
670+ var recordNode = yaml.Node {}
671+ var exprNode = yaml.Node {}
672+
673+ recordNode .SetString (ruleName )
674+ exprNode .SetString (expression )
675+
676+ return rulefmt.RuleGroup {
677+ Name : groupName ,
678+ Interval : 10 ,
679+ Rules : []rulefmt.RuleNode {{
680+ Record : recordNode ,
681+ Expr : exprNode ,
682+ }},
683+ }
598684}
599685
600686func createTestRuleGroup (t * testing.T ) rulefmt.RuleGroup {
0 commit comments