Skip to content

Commit 86532ad

Browse files
committed
LOG-7896: Add alert when forwarder sink is generating errors
1 parent 76b3757 commit 86532ad

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@ spec:
2020
labels:
2121
service: collector
2222
severity: error
23+
- alert: ClusterLogForwarderSinkErrorRate
24+
annotations:
25+
message: The rate of errors detected for pod '{{ $labels.pod }}' owned by
26+
ClusterLogForwarder '{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance
27+
}}' for output '{{ $labels.component_id }}' exceeds 0.
28+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md
29+
summary: |-
30+
The pod '{{ $labels.pod }}' owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" for output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
31+
This could indicate the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
32+
expr: |
33+
sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
34+
for: 1m
35+
labels:
36+
service: clusterlogforwarder
37+
severity: critical
2338
- alert: ClusterLogForwarderDeprecations
2439
annotations:
2540
message: The Cluster Logging Operator version {{$labels.version}} includes

config/prometheus/alert_test.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: collector-test
5+
namespace: openshift-logging
6+
spec:
7+
groups:
8+
- name: logging_collector_new.alerts
9+
rules:
10+
- alert: ClusterLogForwarderSinkErrorRate
11+
annotations:
12+
description: |-
13+
The rate of errors detected for ClusterLogForwarder "{{ $labels.app_kubernetes_io_instance }}" in namespace "{{ $labels.namespace }}" for output "{{ $labels.component_id }}" exceeds 0.
14+
summary: |-
15+
The ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
16+
This could indicate the output URL is misconfigured or the receiver is unavailable.
17+
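runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md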
18+
expr: |
19+
sum by (namespace,app_kubernetes_io_instance, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
20+
for: 1m
21+
labels:
22+
service: collector
23+
severity: critical

config/prometheus/collector_alerts.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,19 @@ spec:
2020
labels:
2121
service: collector
2222
severity: error
23+
- alert: ClusterLogForwarderSinkErrorRate
24+
annotations:
25+
message: "The rate of errors detected for pod '{{ $labels.pod }}' owned by ClusterLogForwarder '{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}' for output '{{ $labels.component_id }}' exceeds 0."
26+
summary: |-
27+
The pod '{{ $labels.pod }}' owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}" for output "{{ $labels.component_id }}" is generating the error: "{{ $labels.error_kind }}".
28+
This could indicate the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
29+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderSinkErrorRate.md
30+
expr: |
31+
sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(irate(vector_http_client_errors_total[2m])) > 0
32+
for: 1m
33+
labels:
34+
service: clusterlogforwarder
35+
severity: critical
2336
- alert: ClusterLogForwarderDeprecations
2437
annotations:
2538
message: "The Cluster Logging Operator version {{$labels.version}} includes deprecations to some feature of ClusterLogForwarder."

0 commit comments

Comments
 (0)