Skip to content

Commit 1e8f836

Browse files
committed
LOG-7896: Add alert when forwarder sink is generating errors
1 parent 76b3757 commit 1e8f836

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,24 @@ spec:
2020
labels:
2121
service: collector
2222
severity: error
23+
# Fires when, for a given namespace / ClusterLogForwarder instance / pod /
# output component / error kind, the ratio of HTTP client errors to HTTP
# requests sent stays above 10% for 5 minutes.
- alert: ClusterLogForwarderOutputErrorRate
  annotations:
    description: The rate of output errors detected for {{ $labels.namespace }}/{{
      $labels.pod }} pod exceeds the threshold of 10%.
    runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md
    summary: "The pod \"{{ $labels.pod }}\" owned by ClusterLogForwarder \"{{
      $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}\" \nfor output
      \"{{ $labels.component_id }}\" has been generating the error: \"{{ $labels.error_kind
      }}\" for the last 5m\nat the rate of {{ $value | humanizePercentage }} which
      exceeds the threshold of 10%.\nThis could indicate: the output URL is misconfigured,
      the receiver is unavailable, or there are networking issues for that pod."
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the range, so a single quiet scrape interval resets the `for`
  # timer and a single spike can trip the threshold. rate() averages over the
  # whole 5m window, which is what the annotations ("for the last 5m") describe.
  #
  # The division matches on every label except error_kind; group_left keeps the
  # extra error_kind label from the error series (left side) so the alert
  # reports one ratio per error kind.
  expr: "sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(rate(vector_http_client_errors_total[5m])
    \n/ on (namespace,app_kubernetes_io_instance, pod, component_id) group_left
    rate(vector_http_client_requests_sent_total[5m])) > 0.10\n"
  # The ratio must stay above the threshold for this long before firing.
  for: 5m
  labels:
    service: clusterlogforwarder
    severity: critical
2341
- alert: ClusterLogForwarderDeprecations
2442
annotations:
2543
message: The Cluster Logging Operator version {{$labels.version}} includes

config/prometheus/collector_alerts.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,23 @@ spec:
2020
labels:
2121
service: collector
2222
severity: error
23+
# Fires when, for a given namespace / ClusterLogForwarder instance / pod /
# output component / error kind, the ratio of HTTP client errors to HTTP
# requests sent stays above 10% for 5 minutes.
- alert: ClusterLogForwarderOutputErrorRate
  annotations:
    description: |-
      The rate of output errors detected for {{ $labels.namespace }}/{{ $labels.pod }} pod exceeds the threshold of 10%.
    summary: |-
      The pod "{{ $labels.pod }}" owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}"
      for output "{{ $labels.component_id }}" has been generating the error: "{{ $labels.error_kind }}" for the last 5m
      at the rate of {{ $value | humanizePercentage }} which exceeds the threshold of 10%.
      This could indicate: the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
    runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the range, so a single quiet scrape interval resets the `for`
  # timer and a single spike can trip the threshold. rate() averages over the
  # whole 5m window, which is what the annotations ("for the last 5m") describe.
  #
  # The division matches on every label except error_kind; group_left keeps the
  # extra error_kind label from the error series (left side) so the alert
  # reports one ratio per error kind.
  expr: |
    sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(rate(vector_http_client_errors_total[5m])
    / on (namespace,app_kubernetes_io_instance, pod, component_id) group_left rate(vector_http_client_requests_sent_total[5m])) > 0.10
  # The ratio must stay above the threshold for this long before firing.
  for: 5m
  labels:
    service: clusterlogforwarder
    severity: critical
2340
- alert: ClusterLogForwarderDeprecations
2441
annotations:
2542
message: "The Cluster Logging Operator version {{$labels.version}} includes deprecations to some feature of ClusterLogForwarder."

0 commit comments

Comments
 (0)