# DO NOT EDIT - This file is being maintained by Chef

groups:
  - name: alertmanager
    rules:
      - alert: prometheus target missing
        expr: up == 0
        for: 5m
        labels:
          alertgroup: "prometheus"
  - name: amsterdam
    rules:
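      # The rPDU2 SNMP gauges are scaled integers: current and temperature are
      # reported in tenths and humidity as a whole-number percentage, hence the
      # divisions by 10 and 100 below.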
      - alert: pdu current draw
        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10
        for: 5m
        labels:
          alertgroup: "amsterdam"
        annotations:
          current: "{{ $value | humanize }}A"
      - alert: site current draw
        expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13
        for: 5m
        labels:
          alertgroup: "amsterdam"
        annotations:
          current: "{{ $value | humanize }}A"
      - alert: site temperature
        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25
        for: 5m
        labels:
          alertgroup: "amsterdam"
        annotations:
          temperature: "{{ $value | humanize }}C"
      - alert: site humidity
        expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 < 0.25 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 > 0.65
        for: 5m
        labels:
          alertgroup: "amsterdam"
        annotations:
          humidity: "{{ $value | humanizePercentage }}"
  - name: apache
    rules:
      - alert: apache down
        expr: apache_up == 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: apache workers busy
        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          busy_workers: "{{ $value | humanizePercentage }}"
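      # Compare the current request rate against the same time last week, and
      # only alert when last week's rate was above 2 requests/s so that quiet
      # instances do not page.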
      - alert: apache low request rate
        expr: rate(apache_accesses_total[1h]) / rate(apache_accesses_total[1h] offset 1w) < 0.25 and rate(apache_accesses_total[1h] offset 1w) > 2
        for: 15m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          request_rate: "{{ $value | humanizePercentage }}"
  - name: chef
    rules:
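      # Fires when the chef-client systemd timer has not triggered for more
      # than an hour, sustained for 12 hours.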
      - alert: chef client not running
        expr: time() - node_systemd_timer_last_trigger_seconds{name="chef-client.timer"} > 3600
        for: 12h
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          down_time: "{{ $value | humanizeDuration }}"
  - name: cpu
    rules:
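      # Pressure stall information (PSI): alert when processes are stalled
      # waiting for CPU more than 60% of the time.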
      - alert: cpu pressure
        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
        for: 15m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: database
    rules:
      - alert: postgres replication delay
        expr: pg_replication_lag_seconds > 5
        for: 5m
        labels:
          alertgroup: database
        annotations:
          delay: "{{ $value | humanizeDuration }}"
  - name: fastly
    rules:
      - alert: error rate
        expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
        for: 15m
        labels:
          alertgroup: fastly
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
  - name: filesystem
    rules:
      - alert: readonly filesystem
        expr: node_filesystem_readonly == 1
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
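      # Prometheus expands these annotation templates when the alert fires;
      # the query function runs an ad-hoc PromQL query so the notification can
      # include the current raw values alongside the alerting ratio.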
      - alert: filesystem low on space
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
          total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
      - alert: filesystem low on inodes
        expr: node_filesystem_files_free / node_filesystem_files < 0.1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
          total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
  - name: hwmon
    rules:
      - alert: hwmon fan alarm
        expr: node_hwmon_fan_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
          fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: hwmon temperature alarm
        expr: node_hwmon_temp_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: hwmon voltage alarm
        expr: node_hwmon_in_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
  - name: io
    rules:
      - alert: io pressure
        expr: rate(node_pressure_io_waiting_seconds_total[5m]) > 0.6
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
  - name: ipmi
    rules:
      - alert: ipmi fan alarm
        expr: ipmi_fan_speed_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: ipmi temperature alarm
        expr: ipmi_temperature_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: ipmi voltage alarm
        expr: ipmi_voltage_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
      - alert: ipmi power alarm
        expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: mail
    rules:
      - alert: exim queue length
        expr: exim_queue > exim_queue_limit
        for: 60m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
      - alert: mailman queue length
        expr: mailman_queue_length > 200
        for: 60m
        labels:
          alertgroup: mail
        annotations:
          queue_length: "{{ $value }}"
  - name: mdadm
    rules:
      - alert: mdadm array inactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
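      # An array is degraded when the number of active disks drops below the
      # number the array requires.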
      - alert: mdadm array degraded
        expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
      - alert: mdadm disk failed
        expr: node_md_disks{state="failed"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
  - name: memory
    rules:
      - alert: low memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        for: 15m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          memory_free: "{{ $value | humanizePercentage }}"
      - alert: memory pressure
        expr: rate(node_pressure_memory_waiting_seconds_total[5m]) > 0.6
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          pressure: "{{ $value | humanizePercentage }}"
      - alert: oom kill detected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_oom_kills: "{{ $value }}"
  - name: network
    rules:
      - alert: interface transmit rate
        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface receive rate
        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface transmit errors
        expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: interface receive errors
        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: conntrack entries
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          entries_used: "{{ $value | humanizePercentage }}"
  - name: planet
    rules:
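      # File ages are checked via file_stat_modif_time_seconds; the join on
      # chef_role{name="planetdump"} limits each alert to the host that
      # currently carries the planetdump role.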
      - alert: planet dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/(pbf|planet)/.*"} > 7 * 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 24h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: notes dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/notes/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 6h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: daily replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/day/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 3h
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: hourly replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/hour/.*"} > 3600 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 30m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: minutely replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/minute/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: changeset replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/changesets/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
  - name: postgresql
    rules:
      - alert: postgresql down
        expr: pg_up == 0
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: postgresql replication delay
        expr: pg_replication_lag_seconds > 5
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: postgresql connection limit
        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections_used: "{{ $value | humanizePercentage }}"
      - alert: postgresql deadlocks
        expr: increase(pg_stat_database_deadlocks[1m]) > 5
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_deadlocks: "{{ $value }}"
      - alert: postgresql slow queries
        expr: pg_slow_queries > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          queries: "{{ $value }}"
  - name: smart
    rules:
      - alert: smart failure
        expr: smart_health_status == 0
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: smart ssd wearout approaching
        expr: smart_percentage_used >= 90
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_used: "{{ $value | humanizePercentage }}"
  - name: ssl
    rules:
      - alert: ssl certificate probe failed
        expr: ssl_probe_success == 0
        for: 60m
        labels:
          alertgroup: ssl
      - alert: ssl certificate expiry
        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
        for: 0m
        labels:
          alertgroup: ssl
        annotations:
          expires_in: "{{ $value | humanizeDuration }}"
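      # ssl_ocsp_response_status is assumed to follow the OCSP certificate
      # status encoding (0 = good, 1 = revoked, 2 = unknown), so the two
      # alerts below match on different values.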
      - alert: ssl certificate revoked
        expr: ssl_ocsp_response_status == 1
        for: 0m
        labels:
          alertgroup: ssl
      - alert: ocsp status unknown
        expr: ssl_ocsp_response_status == 2
        for: 0m
        labels:
          alertgroup: ssl
  - name: statuscake
    rules:
      - alert: statuscake uptime check failing
        expr: statuscake_uptime{status="down",paused="false"} > 0
        for: 0m
        labels:
          alertgroup: statuscake
  - name: systemd
    rules:
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed",name!="chef-client.service"} == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
        for: 6h
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: tile
    rules:
      - alert: renderd replication delay
        expr: renderd_replication_delay > 120
        for: 15m
        labels:
          alertgroup: tile
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: missed tile rate
        expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
        for: 5m
        labels:
          alertgroup: tile
        annotations:
          miss_rate: "{{ $value | humanizePercentage }}"
  - name: time
    rules:
      - alert: clock not synchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
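      # Fires when the clock offset exceeds 50ms and the derivative shows it
      # is not moving back towards zero.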
      - alert: clock skew detected
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | first | value | humanizeDuration }}{{ end }}"
  - name: web
    rules:
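      # The status regex covers 5xx responses but skips 509 (conventionally
      # used for bandwidth limiting), so those are not counted as errors.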
      - alert: web error rate
        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
        for: 5m
        labels:
          alertgroup: web
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
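      # Jobs are inserted into delayed_jobs when queued and (by default)
      # deleted once processed, so a delete/insert ratio well below 1 on the
      # database master suggests the queue is falling behind.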
      - alert: job processing rate
        expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
        for: 15m
        labels:
          alertgroup: web
        annotations:
          job_processing_rate: "{{ $value | humanizePercentage }}"