# DO NOT EDIT - This file is being maintained by Chef

groups:
  - name: alertmanager
    rules:
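      # "up" is set to 0 by Prometheus itself whenever a scrape fails, so this
      # fires for any exporter or host that has been unreachable for five minutes.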
      - alert: prometheus target missing
        expr: up == 0
        for: 5m
        labels:
          alertgroup: "prometheus"
  - name: apache
    rules:
      - alert: apache down
        expr: apache_up == 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
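      # Busy workers as a share of all scoreboard slots; fires when the worker
      # pool has been more than 80% occupied for five minutes.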
      - alert: apache workers busy
        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          busy_workers: "{{ $value | humanizePercentage }}"
  - name: database
    rules:
      - alert: postgres replication delay
        expr: pg_replication_lag_seconds > 5
        for: 5m
        labels:
          alertgroup: database
        annotations:
          delay: "{{ $value | humanizeDuration }}"
  - name: filesystem
    rules:
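      # The annotations below use Prometheus alert templating: printf builds a
      # selector for the firing series, "query" evaluates it, and "first | value"
      # extracts the sample so it can be piped through the humanize functions.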
      - alert: filesystem low on space
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_bytes: "{{ with printf \"node_filesystem_avail_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
          total_bytes: "{{ with printf \"node_filesystem_size_bytes{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value | humanize1024 }}bytes{{end}}"
      - alert: filesystem low on inodes
        expr: node_filesystem_files_free / node_filesystem_files < 0.1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_free: "{{ $value | humanizePercentage }}"
          free_inodes: "{{ with printf \"node_filesystem_files_free{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
          total_inodes: "{{ with printf \"node_filesystem_files{instance='%s',mountpoint='%s'}\" $labels.instance $labels.mountpoint | query }}{{ . | first | value }}{{end}}"
  - name: hwmon
    rules:
      - alert: hwmon fan alarm
        expr: node_hwmon_fan_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
          fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: hwmon temperature alarm
        expr: node_hwmon_temp_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
          temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: hwmon voltage alarm
        expr: node_hwmon_in_alarm == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
          in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
  - name: ipmi
    rules:
      - alert: ipmi fan alarm
        expr: ipmi_fan_speed_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
      - alert: ipmi temperature alarm
        expr: ipmi_temperature_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
      - alert: ipmi voltage alarm
        expr: ipmi_voltage_state > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
      - alert: ipmi power alarm
        expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: mdadm
    rules:
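      # "for: 0m" makes these alerts fire as soon as the condition is observed,
      # since RAID problems warrant immediate attention.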
      - alert: mdadm array inactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
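      # Summing away the "state" label lets the active-disk count match against
      # node_md_disks_required, which carries no "state" label.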
      - alert: mdadm array degraded
        expr: sum (node_md_disks{state="active"}) without (state) < node_md_disks_required
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
      - alert: mdadm disk failed
        expr: node_md_disks{state="failed"} > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
          spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
  - name: memory
    rules:
      - alert: low memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          memory_free: "{{ $value | humanizePercentage }}"
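      # A sustained major page fault rate means the working set no longer fits
      # in memory and the host is thrashing.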
      - alert: memory pressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          major_page_faults: "{{ $value }} faults/s"
      - alert: oom kill detected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_oom_kills: "{{ $value }}"
  - name: network
    rules:
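      # Throughput relative to the negotiated link speed; a sustained rate above
      # 98% means the interface is effectively saturated.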
      - alert: interface transmit rate
        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface receive rate
        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          bandwidth_used: "{{ $value | humanizePercentage }}"
      - alert: interface transmit errors
        expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: interface receive errors
        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"
      - alert: conntrack entries
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          entries_used: "{{ $value | humanizePercentage }}"
  - name: planet
    rules:
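      # Each rule is joined against chef_role{name="planetdump"} so it only
      # fires on hosts carrying that Chef role; "ignoring (job, name, path)"
      # drops the labels that differ between the two metrics when matching.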
      - alert: planet dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/(pbf|planet)/.*"} > 7 * 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 24h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: notes dump overdue
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/notes/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 6h
        labels:
          alertgroup: planet
        annotations:
          overdue_by: "{{ $value | humanizeDuration }}"
      - alert: daily replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/day/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 3h
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: hourly replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/hour/.*"} > 3600 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 30m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: minutely replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/minute/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
      - alert: changeset replication feed delayed
        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/changesets/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
        for: 5m
        labels:
          alertgroup: planet
        annotations:
          delayed_by: "{{ $value | humanizeDuration }}"
  - name: postgresql
    rules:
      - alert: postgresql down
        expr: pg_up == 0
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: postgresql replication delay
        expr: pg_replication_lag_seconds > 5
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          delay: "{{ $value | humanizeDuration }}"
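      # Connections are aggregated per instance and server before comparing
      # against max_connections, so a cluster is judged as a whole rather than
      # per database.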
      - alert: postgresql connection limit
        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
        for: 1m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          connections_used: "{{ $value | humanizePercentage }}"
      - alert: postgresql deadlocks
        expr: increase(pg_stat_database_deadlocks[1m]) > 5
        for: 0m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          new_deadlocks: "{{ $value }}"
      - alert: postgresql slow queries
        expr: pg_slow_queries > 0
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          queries: "{{ $value }}"
  - name: smart
    rules:
      - alert: smart failure
        expr: smart_health_status == 0
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
      - alert: smart ssd wearout approaching
        # smart_percentage_used is on a 0-100 scale, so divide by 100 to get the
        # ratio that humanizePercentage expects in the annotation below.
        expr: smart_percentage_used / 100 >= 0.9
        for: 60m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          percentage_used: "{{ $value | humanizePercentage }}"
  - name: ssl
    rules:
      - alert: ssl certificate probe failed
        expr: ssl_probe_success == 0
        for: 60m
        labels:
          alertgroup: ssl
      - alert: ssl certificate expiry
        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
        for: 0m
        labels:
          alertgroup: ssl
        annotations:
          expires_in: "{{ $value | humanizeDuration }}"
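      # ssl_ocsp_response_status codes: 0 = good, 1 = revoked, 2 = unknown.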
      - alert: ssl certificate revoked
        expr: ssl_ocsp_response_status == 1
        for: 0m
        labels:
          alertgroup: ssl
      - alert: ocsp status unknown
        expr: ssl_ocsp_response_status == 2
        for: 0m
        labels:
          alertgroup: ssl
  - name: systemd
    rules:
      - alert: systemd failed service
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
  - name: tile
    rules:
      - alert: renderd replication delay
        expr: renderd_replication_delay > 120
        for: 5m
        labels:
          alertgroup: tile
        annotations:
          delay: "{{ $value | humanizeDuration }}"
      - alert: missed tile rate
        expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
        for: 5m
        labels:
          alertgroup: tile
        annotations:
          miss_rate: "{{ $value | humanizePercentage }}"
  - name: time
    rules:
      - alert: clock not synchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
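      # Fires only while the offset is both large and not converging back
      # towards zero (positive and still growing, or negative and still falling).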
      - alert: clock skew detected
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 5m
        labels:
          alertgroup: "{{ $labels.instance }}"
        annotations:
          skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }}{{ . | first | value | humanizeDuration }}{{ end }}"
  - name: web
    rules:
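      # The status regex matches every 5xx code except 509, which the API uses
      # for bandwidth limiting rather than to signal a server fault.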
      - alert: web error rate
        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
        for: 5m
        labels:
          alertgroup: web
        annotations:
          error_rate: "{{ $value | humanizePercentage }}"