From 1e8f8366278ab353589173ffc3b9ab7fe1756820 Mon Sep 17 00:00:00 2001
From: Jeff Cantrill
Date: Thu, 16 Oct 2025 15:59:40 -0400
Subject: [PATCH] LOG-7896: Add alert when forwarder sink is generating errors

Add the ClusterLogForwarderOutputErrorRate alert to both the bundle
PrometheusRule manifest and config/prometheus/collector_alerts.yaml.

The rule computes, per namespace, ClusterLogForwarder instance, pod,
output (component_id) and error_kind, the ratio of
vector_http_client_errors_total to vector_http_client_requests_sent_total
and fires at severity "critical" when that ratio stays above 10% for 5m.

The ratio is built from rate() over a 5m window rather than irate():
irate() only considers the last two samples in the window, so a single
quiet scrape interval would reset the "for: 5m" pending state. Using
rate() also makes the reported $value the 5m average that the summary
text describes.
---
 ...onitoring.coreos.com_v1_prometheusrule.yaml | 18 ++++++++++++++++++
 config/prometheus/collector_alerts.yaml        | 17 +++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml b/bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml
index 7a9d4da19..d1b9fd67f 100644
--- a/bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml
+++ b/bundle/manifests/collector_monitoring.coreos.com_v1_prometheusrule.yaml
@@ -20,6 +20,24 @@ spec:
       labels:
         service: collector
         severity: error
+    - alert: ClusterLogForwarderOutputErrorRate
+      annotations:
+        description: The rate of output errors detected for {{ $labels.namespace }}/{{
+          $labels.pod }} pod exceeds the threshold of 10%.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md
+        summary: "The pod \"{{ $labels.pod }}\" owned by ClusterLogForwarder \"{{
+          $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}\" \nfor output
+          \"{{ $labels.component_id }}\" has been generating the error: \"{{ $labels.error_kind
+          }}\" for the last 5m\nat the rate of {{ $value | humanizePercentage }} which
+          exceeds the threshold of 10%.\nThis could indicate: the output URL is misconfigured,
+          the receiver is unavailable, or there are networking issues for that pod."
+      expr: "sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(rate(vector_http_client_errors_total[5m])
+        \n/ on (namespace,app_kubernetes_io_instance, pod, component_id) group_left
+        rate(vector_http_client_requests_sent_total[5m])) > 0.10\n"
+      for: 5m
+      labels:
+        service: clusterlogforwarder
+        severity: critical
     - alert: ClusterLogForwarderDeprecations
       annotations:
         message: The Cluster Logging Operator version {{$labels.version}} includes
diff --git a/config/prometheus/collector_alerts.yaml b/config/prometheus/collector_alerts.yaml
index 86c3811ac..de86a0a23 100644
--- a/config/prometheus/collector_alerts.yaml
+++ b/config/prometheus/collector_alerts.yaml
@@ -20,6 +20,23 @@ spec:
       labels:
         service: collector
         severity: error
+    - alert: ClusterLogForwarderOutputErrorRate
+      annotations:
+        description: |-
+          The rate of output errors detected for {{ $labels.namespace }}/{{ $labels.pod }} pod exceeds the threshold of 10%.
+        summary: |-
+          The pod "{{ $labels.pod }}" owned by ClusterLogForwarder "{{ $labels.namespace }}/{{ $labels.app_kubernetes_io_instance }}"
+          for output "{{ $labels.component_id }}" has been generating the error: "{{ $labels.error_kind }}" for the last 5m
+          at the rate of {{ $value | humanizePercentage }} which exceeds the threshold of 10%.
+          This could indicate: the output URL is misconfigured, the receiver is unavailable, or there are networking issues for that pod.
+        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-logging-operator/ClusterLogForwarderOutputErrorRate.md
+      expr: |
+        sum by (namespace,app_kubernetes_io_instance, pod, component_id, error_kind)(rate(vector_http_client_errors_total[5m])
+        / on (namespace,app_kubernetes_io_instance, pod, component_id) group_left rate(vector_http_client_requests_sent_total[5m])) > 0.10
+      for: 5m
+      labels:
+        service: clusterlogforwarder
+        severity: critical
     - alert: ClusterLogForwarderDeprecations
       annotations:
         message: "The Cluster Logging Operator version {{$labels.version}} includes deprecations to some feature of ClusterLogForwarder."