|
7 | 7 | { |
8 | 8 | alert: 'PostgreSQLMaxConnectionsReached', |
9 | 9 | annotations: { |
10 | | - description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.', |
| 10 | + description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).', |
11 | 11 | summary: 'Postgres connections count is over the maximum amount.', |
12 | 12 | }, |
13 | 13 | expr: ||| |
14 | | - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 14 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
15 | 15 | >= |
16 | | - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 16 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
17 | 17 | - |
18 | | - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
19 | | - ||| % $._config, |
| 18 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 19 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
20 | 20 | 'for': '1m', |
21 | 21 | labels: { |
22 | 22 | severity: 'warning', |
|
29 | 29 | summary: 'Postgres connections count is over 80% of maximum amount.', |
30 | 30 | }, |
31 | 31 | expr: ||| |
32 | | - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 32 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
33 | 33 | > |
34 | 34 | ( |
35 | | - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 35 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
36 | 36 | - |
37 | | - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 37 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
38 | 38 | ) * 0.8 |
39 | | - ||| % $._config, |
| 39 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
40 | 40 | 'for': '10m', |
41 | 41 | labels: { |
42 | 42 | severity: 'warning', |
|
61 | 61 | summary: 'PostgreSQL high number of slow queries.', |
62 | 62 | }, |
63 | 63 | expr: ||| |
64 | | - avg by (datname) ( |
| 64 | + avg by (datname, %(agg)s) ( |
65 | 65 | rate ( |
66 | | - pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m] |
| 66 | + pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m] |
67 | 67 | ) |
68 | 68 | ) > 2 * 60 |
69 | | - ||| % $._config, |
| 69 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
70 | 70 | 'for': '2m', |
71 | 71 | labels: { |
72 | 72 | severity: 'warning', |
|
79 | 79 | summary: 'PostgreSQL high number of queries per second.', |
80 | 80 | }, |
81 | 81 | expr: ||| |
82 | | - avg by (datname) ( |
| 82 | + avg by (datname, %(agg)s) ( |
83 | 83 | irate( |
84 | | - pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 84 | + pg_stat_database_xact_commit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
85 | 85 | ) |
86 | 86 | + |
87 | 87 | irate( |
88 | | - pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 88 | + pg_stat_database_xact_rollback{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
89 | 89 | ) |
90 | 90 | ) > 10000 |
91 | | - ||| % $._config, |
| 91 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
92 | 92 | 'for': '5m', |
93 | 93 | labels: { |
94 | 94 | severity: 'warning', |
|
101 | 101 | summary: 'PostgreSQL low cache hit rate.', |
102 | 102 | }, |
103 | 103 | expr: ||| |
104 | | - avg by (datname) ( |
105 | | - rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) |
| 104 | + avg by (datname, %(agg)s) ( |
| 105 | + rate(pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]) |
106 | 106 | / |
107 | 107 | ( |
108 | 108 | rate( |
109 | | - pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 109 | + pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
110 | 110 | ) |
111 | 111 | + |
112 | 112 | rate( |
113 | | - pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m] |
| 113 | + pg_stat_database_blks_read{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m] |
114 | 114 | ) |
115 | 115 | ) |
116 | 116 | ) < 0.98 |
117 | | - ||| % $._config, |
| 117 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
118 | 118 | 'for': '5m', |
119 | 119 | labels: { |
120 | 120 | severity: 'warning', |
|
157 | 157 | summary: 'PostgreSQL has high number of acquired locks.', |
158 | 158 | }, |
159 | 159 | expr: ||| |
160 | | - max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) / |
161 | | - on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 |
162 | | - ||| % $._config, |
| 160 | + max by(datname, %(agg)s) ( |
| 161 | + (pg_locks_count{%(dbNameFilter)s}) |
| 162 | + / |
| 163 | + on(%(aggWithoutServer)s) group_left(server) ( |
| 164 | + pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{} |
| 165 | + ) |
| 166 | + ) > 0.20 |
| 167 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels), aggWithoutServer: std.join(',', std.filter(function(x) x != "server", $._config.groupLabels + $._config.instanceLabels)) }, |
163 | 168 | 'for': '5m', |
164 | 169 | labels: { |
165 | 170 | severity: 'warning', |
|
171 | 176 | description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.', |
172 | 177 | summary: 'PostgreSQL replication lagging more than 1 hour.', |
173 | 178 | }, |
174 | | - expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)', |
| 179 | + expr: ||| |
| 180 | + (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1) |
| 181 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
175 | 182 | 'for': '5m', |
176 | 183 | labels: { |
177 | 184 | severity: 'warning', |
|
223 | 230 | timestamp( |
224 | 231 | pg_stat_user_tables_n_dead_tup{} > |
225 | 232 | pg_stat_user_tables_n_live_tup{} |
226 | | - * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
227 | | - + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{} |
| 233 | + * on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
| 234 | + + on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{} |
228 | 235 | ) |
229 | 236 | < time() - 36000 |
230 | 237 | ) |
231 | | - |||, |
| 238 | + ||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) }, |
232 | 239 | 'for': '30m', |
233 | 240 | labels: { |
234 | 241 | severity: 'critical', |
|
0 commit comments