Skip to content

Commit de45446

Browse files
committed
Improve ruler test to check for more scenarios.
Signed-off-by: Peter Štibraný <[email protected]>
1 parent 91d0e15 commit de45446

File tree

2 files changed

+129
-38
lines changed

2 files changed

+129
-38
lines changed

integration/e2ecortex/client.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,11 @@ func (c *Client) SetRuleGroup(rulegroup rulefmt.RuleGroup, namespace string) err
318318
}
319319

320320
defer res.Body.Close()
321+
322+
if res.StatusCode != 202 {
323+
return fmt.Errorf("unexpected status code: %d", res.StatusCode)
324+
}
325+
321326
return nil
322327
}
323328

integration/ruler_test.go

Lines changed: 124 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -528,30 +528,11 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
528528
require.NoError(t, err)
529529
defer s.Close()
530530

531-
// Prepare rule group with invalid rule.
532-
var recordNode = yaml.Node{}
533-
var exprNode = yaml.Node{}
534-
535-
recordNode.SetString("invalid_rule")
536-
exprNode.SetString(`label_replace(metric, "foo", "$1", "service", "[")`) // Syntactically correct expression (passes check in ruler), but it fails because of invalid regex.
537-
538-
ruleGroup := rulefmt.RuleGroup{
539-
Name: "group_with_invalid_rule",
540-
Interval: 10,
541-
Rules: []rulefmt.RuleNode{{
542-
Record: recordNode,
543-
Expr: exprNode,
544-
}},
545-
}
546-
547531
// Start dependencies.
548532
consul := e2edb.NewConsul()
549533
minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
550534
require.NoError(t, s.StartAndWaitReady(consul, minio))
551535

552-
// Start Cortex components.
553-
require.NoError(t, copyFileToSharedDir(s, "docs/configuration/single-process-config-blocks.yaml", cortexConfigFile))
554-
555536
// Configure the ruler.
556537
flags := mergeFlags(
557538
BlocksStorageFlags(),
@@ -564,37 +545,142 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
564545
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
565546
// Evaluate rules often, so that we don't need to wait for metrics to show up.
566547
"-ruler.evaluation-interval": "2s",
548+
"-ruler.poll-interval": "2s",
549+
// No delay
550+
"-ruler.evaluation-delay-duration": "0",
551+
552+
"-blocks-storage.tsdb.block-ranges-period": "1h",
553+
"-blocks-storage.bucket-store.sync-interval": "1s",
554+
"-blocks-storage.tsdb.retention-period": "2h",
555+
556+
// We run single ingester only, no replication.
557+
"-distributor.replication-factor": "1",
558+
559+
// Very low limit so that ruler hits it.
560+
"-querier.max-fetched-chunks-per-query": "5",
561+
// We need this to make limit work.
562+
"-ingester.stream-chunks-when-using-blocks": "true",
567563
},
568564
)
569565

570-
// Start ruler.
571-
cortex := e2ecortex.NewSingleBinaryWithConfigFile("cortex", cortexConfigFile, flags, "", 9009, 9095)
572-
require.NoError(t, s.StartAndWaitReady(cortex))
566+
const namespace = "test"
567+
const user = "user"
573568

574-
c, err := e2ecortex.NewClient(cortex.HTTPEndpoint(), cortex.HTTPEndpoint(), "", cortex.HTTPEndpoint(), "user")
575-
require.NoError(t, err)
569+
distributor := e2ecortex.NewDistributor("distributor", consul.NetworkHTTPEndpoint(), flags, "")
570+
ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
571+
ingester := e2ecortex.NewIngester("ingester", consul.NetworkHTTPEndpoint(), flags, "")
572+
require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler))
576573

577-
// Push some series to Cortex.
578-
series, _ := generateSeries("metric", time.Now(), prompb.Label{Name: "foo", Value: "bar"})
574+
// Wait until both the distributor and ruler have updated the ring. The querier will also watch
575+
// the store-gateway ring if blocks sharding is enabled.
576+
require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
577+
require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
579578

580-
res, err := c.Push(series)
579+
c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
581580
require.NoError(t, err)
582-
require.Equal(t, 200, res.StatusCode)
583581

584-
// Upload rule group to the ruler.
582+
// Push some series to Cortex -- enough so that we can hit some limits.
583+
for i := 0; i < 10; i++ {
584+
series, _ := generateSeries("metric", time.Now(), prompb.Label{Name: "foo", Value: fmt.Sprintf("%d", i)})
585+
586+
res, err := c.Push(series)
587+
require.NoError(t, err)
588+
require.Equal(t, 200, res.StatusCode)
589+
}
590+
591+
totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
585592
require.NoError(t, err)
586-
require.NoError(t, c.SetRuleGroup(ruleGroup, "test"))
587593

588-
// Wait until ruler has loaded the group.
589-
require.NoError(t, cortex.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WaitMissingMetrics))
594+
// Verify that user-failures don't increase cortex_ruler_queries_failed_total
595+
for groupName, expression := range map[string]string{
596+
// Syntactically correct expression (passes check in ruler), but failing because of invalid regex. This fails in PromQL engine.
597+
"invalid_group": `label_replace(metric, "foo", "$1", "service", "[")`,
598+
599+
// This one fails in querier code, because of limits.
600+
"too_many_chunks_group": `sum(metric)`,
601+
} {
602+
t.Run(groupName, func(t *testing.T) {
603+
require.NoError(t, c.SetRuleGroup(ruleGroupWithRule(groupName, "rule", expression), namespace))
604+
m := ruleGroupMatcher(user, namespace, groupName)
605+
606+
// Wait until ruler has loaded the group.
607+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
608+
609+
// Wait until rule group has tried to evaluate the rule.
610+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
611+
612+
// Verify that evaluation of the rule failed.
613+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
614+
615+
// But these failures were not reported as "failed queries"
616+
sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
617+
require.NoError(t, err)
618+
require.Equal(t, float64(0), sum[0])
590619

591-
// Wait until rule group has tried to evaluate few times.
592-
require.NoError(t, cortex.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WaitMissingMetrics))
593-
require.NoError(t, cortex.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_total"}, e2e.WaitMissingMetrics))
620+
// Delete rule before checking "cortex_ruler_queries_total", as we want to reuse value for next test.
621+
require.NoError(t, c.DeleteRuleGroup(namespace, groupName))
594622

595-
// We want to verify that ruler doesn't report evaluation failure as "queries failed", but only as "evaluation failures".
596-
require.NoError(t, cortex.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WaitMissingMetrics))
597-
require.NoError(t, cortex.WaitSumMetrics(e2e.Equals(0), "cortex_ruler_queries_failed_total"))
623+
// Wait until ruler has unloaded the group. We don't use any matcher, so there should be no groups (in fact, metric disappears).
624+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
625+
626+
// Check that cortex_ruler_queries_total went up since last test.
627+
newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
628+
require.NoError(t, err)
629+
require.Greater(t, newTotalQueries[0], totalQueries[0])
630+
631+
// Remember totalQueries for next test.
632+
totalQueries = newTotalQueries
633+
})
634+
}
635+
636+
// Now let's upload a non-failing rule, and make sure that it works.
637+
t.Run("real_error", func(t *testing.T) {
638+
const groupName = "good_rule"
639+
const expression = `sum(metric{foo=~"1|2"})`
640+
641+
require.NoError(t, c.SetRuleGroup(ruleGroupWithRule(groupName, "rule", expression), namespace))
642+
m := ruleGroupMatcher(user, namespace, groupName)
643+
644+
// Wait until ruler has loaded the group.
645+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
646+
647+
// Wait until rule group has tried to evaluate the rule, and succeeded.
648+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
649+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
650+
651+
// Still no failures.
652+
sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
653+
require.NoError(t, err)
654+
require.Equal(t, float64(0), sum[0])
655+
656+
// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
657+
require.NoError(t, s.Stop(ingester))
658+
659+
// We should start getting "real" failures now.
660+
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
661+
})
662+
}
663+
664+
func ruleGroupMatcher(user, namespace, groupName string) *labels.Matcher {
665+
return labels.MustNewMatcher(labels.MatchEqual, "rule_group", fmt.Sprintf("/rules/%s/%s;%s", user, namespace, groupName))
666+
}
667+
668+
func ruleGroupWithRule(groupName string, ruleName string, expression string) rulefmt.RuleGroup {
669+
// Prepare rule group with invalid rule.
670+
var recordNode = yaml.Node{}
671+
var exprNode = yaml.Node{}
672+
673+
recordNode.SetString(ruleName)
674+
exprNode.SetString(expression)
675+
676+
return rulefmt.RuleGroup{
677+
Name: groupName,
678+
Interval: 10,
679+
Rules: []rulefmt.RuleNode{{
680+
Record: recordNode,
681+
Expr: exprNode,
682+
}},
683+
}
598684
}
599685

600686
func createTestRuleGroup(t *testing.T) rulefmt.RuleGroup {

0 commit comments

Comments
 (0)