1 # DO NOT EDIT - This file is being maintained by Chef
# Rule "prometheus target missing". Unlike the per-instance rules in this file,
# its alertgroup is the fixed string "prometheus".
6 - alert: prometheus target missing
10 alertgroup: "prometheus"
# Fires for every series whose node_hwmon_fan_alarm value is 1.
13 - alert: hwmon fan alarm
14 expr: node_hwmon_fan_alarm == 1
# alertgroup is filled from the firing series' instance label.
17 alertgroup: "{{ $labels.instance }}"
# Each value below runs a query for the named metric restricted to the same
# instance/chip/sensor labels, takes the first sample and prints it humanized
# with an "rpm" suffix; `with` renders nothing when the query has no result.
19 fan_rpm: "{{ with printf \"node_hwmon_fan_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
20 fan_min_rpm: "{{ with printf \"node_hwmon_fan_min_rpm{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}rpm{{end}}"
# Fires for every series whose node_hwmon_temp_alarm value is 1.
21 - alert: hwmon temperature alarm
22 expr: node_hwmon_temp_alarm == 1
# alertgroup is filled from the firing series' instance label.
25 alertgroup: "{{ $labels.instance }}"
# Current, max and crit readings: each value queries the named metric for the
# same instance/chip/sensor labels, takes the first sample and prints it
# humanized with a "C" suffix; `with` renders nothing on an empty result.
27 temp_celsius: "{{ with printf \"node_hwmon_temp_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
28 temp_max_celsius: "{{ with printf \"node_hwmon_temp_max_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
29 temp_crit_celsius: "{{ with printf \"node_hwmon_temp_crit_celsius{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}C{{end}}"
# Fires for every series whose node_hwmon_in_alarm value is 1.
30 - alert: hwmon voltage alarm
31 expr: node_hwmon_in_alarm == 1
# alertgroup is filled from the firing series' instance label.
34 alertgroup: "{{ $labels.instance }}"
# Current, min and max readings: each value queries the named metric for the
# same instance/chip/sensor labels, takes the first sample and prints it
# humanized with a "V" suffix; `with` renders nothing on an empty result.
36 in_volts: "{{ with printf \"node_hwmon_in_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
37 in_min_volts: "{{ with printf \"node_hwmon_in_min_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
38 in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
# Fires for every series whose ipmi_fan_speed_state is above 0.
# NOTE(review): presumably 0 is the nominal state - confirm against the exporter.
41 - alert: ipmi fan alarm
42 expr: ipmi_fan_speed_state > 0
# alertgroup is filled from the firing series' instance label.
45 alertgroup: "{{ $labels.instance }}"
# Queries ipmi_fan_speed_rpm for the same instance/id labels, takes the first
# sample and prints it humanized with an "rpm" suffix; `with` renders nothing
# when the query has no result.
47 fan_speed_rpm: "{{ with printf \"ipmi_fan_speed_rpm{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}rpm{{end}}"
# Fires for every series whose ipmi_temperature_state is above 0.
# NOTE(review): presumably 0 is the nominal state - confirm against the exporter.
48 - alert: ipmi temperature alarm
49 expr: ipmi_temperature_state > 0
# alertgroup is filled from the firing series' instance label.
52 alertgroup: "{{ $labels.instance }}"
# Queries ipmi_temperature_celsius for the same instance/id labels, takes the
# first sample and prints it humanized with a "C" suffix; `with` renders
# nothing when the query has no result.
54 temperature_celsius: "{{ with printf \"ipmi_temperature_celsius{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}C{{end}}"
# Fires for every series whose ipmi_voltage_state is above 0.
# NOTE(review): presumably 0 is the nominal state - confirm against the exporter.
55 - alert: ipmi voltage alarm
56 expr: ipmi_voltage_state > 0
# alertgroup is filled from the firing series' instance label.
59 alertgroup: "{{ $labels.instance }}"
# Queries ipmi_voltage_volts for the same instance/id labels, takes the first
# sample and prints it humanized with a "V" suffix; `with` renders nothing
# when the query has no result.
61 voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
# Fires for every node_md_state series with state="inactive" whose value is
# above 0.
64 - alert: mdadm array inactive
65 expr: node_md_state{state="inactive"} > 0
# alertgroup is filled from the firing series' instance label.
68 alertgroup: "{{ $labels.instance }}"
# Disk counts for the same instance/device: `required` reads
# node_md_disks_required, the other three read node_md_disks filtered by
# state (active, failed, spare). Each takes the first sample and prints it
# humanized followed by " disks"; `with` renders nothing on an empty result.
70 required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
71 active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
72 failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
73 spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
# Fires for every node_md_disks series with state="failed" whose value is
# above 0.
74 - alert: mdadm disk failed
75 expr: node_md_disks{state="failed"} > 0
# alertgroup is filled from the firing series' instance label.
78 alertgroup: "{{ $labels.instance }}"
# Disk counts for the same instance/device: `required` reads
# node_md_disks_required, the other three read node_md_disks filtered by
# state (active, failed, spare). Each takes the first sample and prints it
# humanized followed by " disks"; `with` renders nothing on an empty result.
80 required: "{{ with printf \"node_md_disks_required{instance='%s',device='%s'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
81 active: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='active'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
82 failed: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='failed'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
83 spare: "{{ with printf \"node_md_disks{instance='%s',device='%s',state='spare'}\" $labels.instance $labels.device | query }}{{ . | first | value | humanize }} disks{{end}}"
# Fires when node_memory_MemAvailable_bytes is below 10% of
# node_memory_MemTotal_bytes; the expression value is that percentage.
87 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
# alertgroup is filled from the firing series' instance label.
90 alertgroup: "{{ $labels.instance }}"
# $value is the percentage computed by expr. Print it with one decimal place
# rather than the full float64 rendering (e.g. 9.876543209876543). printf is
# used instead of humanize because humanize would switch to an SI "m" prefix
# for values below 1.
92 memory_free: "{{ $value | printf \"%.1f\" }}%"
# Fires when the per-second rate of node_vmstat_pgmajfault over the last
# minute exceeds 1000.
93 - alert: memory pressure
94 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
# alertgroup is filled from the firing series' instance label.
97 alertgroup: "{{ $labels.instance }}"
# $value is the rate computed by expr; humanize shortens the raw float64
# (e.g. 1234.5678 becomes 1.235k), matching the other values in this file.
99 major_page_faults: "{{ $value | humanize }} faults/s"
# Fires when node_vmstat_oom_kill increased during the last minute.
100 - alert: oom kill detected
101 expr: increase(node_vmstat_oom_kill[1m]) > 0
# alertgroup is filled from the firing series' instance label.
104 alertgroup: "{{ $labels.instance }}"
# increase() extrapolates, so $value can be fractional; humanize trims the
# raw float64 to a short form, matching the other values in this file.
106 new_oom_kills: "{{ $value | humanize }}"