Skip to content

Commit 8443e7b

Browse files
committed
LOG-7896: Add alert when forwarder sink is generating errors
1 parent 6cdb348 commit 8443e7b

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,21 @@ spec:
66
groups:
77
- name: logging_collector.alerts
88
rules:
9+
- alert: ClusterLogForwarderSinkErrorRate
10+
annotations:
11+
message: The rate of errors detected for pod '{{ $labels.pod }}' owned by
12+
ClusterLogForwarder '{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance
13+
}}' for output '{{ $labels.component_id }}' exceeds 0.
14+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md
15+
summary: |-
16+
The pod '{{ $labels.pod }}' owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" for output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
17+
This could indicate the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
18+
expr: |
19+
sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
20+
for: 1m
21+
labels:
22+
service: clusterlogforwarder
23+
severity: critical
924
- alert: ClusterLogForwarderDeprecations
1025
annotations:
1126
message: The Cluster Logging Operator version {{$labels.version}} includes

config/prometheus/alert_test.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: collector-test
5+
namespace: openshift-logging
6+
spec:
7+
groups:
8+
- name: logging_collector_new.alerts
9+
rules:
10+
- alert: ClusterLogForwarderSinkErrorRate
11+
annotations:
12+
description: |-
13+
The rate of errors detected for ClusterLogForwarder "{{ $labels.app_kubernetes_io_instance }}" in namespace "{{ $labels.namespace }}" for output "{{ $labels.component_id }}" exceeds 0.
14+
summary: |-
15+
The ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
16+
This could indicate the output URL is misconfigured or the receiver is unavailable.
17+
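runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md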
18+
expr: |
19+
sum by (namespace,app_kubernetes_io_instance, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
20+
for: 1m
21+
labels:
22+
service: collector
23+
severity: critical

config/prometheus/collector_alerts.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ spec:
77
groups:
88
- name: logging_collector.alerts
99
rules:
10+
- alert: ClusterLogForwarderSinkErrorRate
11+
annotations:
12+
message: "The rate of errors detected for pod '{{ $labels.pod }}' owned by ClusterLogForwarder '{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}' for output '{{ $labels.component_id }}' exceeds 0."
13+
summary: |-
14+
The pod '{{ $labels.pod }}' owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" for output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
15+
This could indicate: the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
16+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md
17+
expr: |
18+
sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
19+
for: 1m
20+
labels:
21+
service: clusterlogforwarder
22+
severity: critical
1023
- alert: ClusterLogForwarderDeprecations
1124
annotations:
1225
message: "The Cluster Logging Operator version {{$labels.version}} includes deprecations to some feature of ClusterLogForwarder."

0 commit comments

Comments
 (0)