alertgroup: "amsterdam"
annotations:
current: "{{ $value | humanize }}A"
# Alerting rule "site power" for the amsterdam site (lines added by this change).
# The expression divides rPDU2PhaseStatusApparentPower by 100 for every series
# labelled site="amsterdam" and rPDU2PhaseStatusIndex="1", sums the results,
# and matches while that total is above 3.
# NOTE(review): the "kVA" suffix in the annotation suggests the raw metric is
# reported in hundredths of a kVA, i.e. a 3 kVA threshold - confirm against the
# PDU MIB.
+ - alert: site power
+ expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3
# Prometheus `for` clause: the expression has to keep matching for 6 minutes
# before the alert goes from pending to firing.
+ for: 6m
+ labels:
# Fixed group label; same value as the site label selected in the expression.
+ alertgroup: "amsterdam"
+ annotations:
# Templated annotation: the expression's value passed through `humanize`,
# followed by the literal suffix "kVA".
+ current: "{{ $value | humanize }}kVA"
- alert: site temperature
expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
for: 6m
- name: cpu
rules:
- alert: cpu pressure
- expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
- for: 15m
+ expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
+ for: 60m
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
alertgroup: "dublin"
annotations:
current: "{{ $value | humanize }}A"
# Alerting rule "site power" for the dublin site (lines added by this change).
# The expression divides rPDU2PhaseStatusApparentPower by 100 for every series
# labelled site="dublin" and rPDU2PhaseStatusIndex="1", sums the results, and
# matches while that total is above 4.
# NOTE(review): the "kVA" suffix in the annotation suggests the raw metric is
# reported in hundredths of a kVA, i.e. a 4 kVA threshold - confirm against the
# PDU MIB.
+ - alert: site power
+ expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4
# Prometheus `for` clause: the expression has to keep matching for 6 minutes
# before the alert goes from pending to firing.
+ for: 6m
+ labels:
# Fixed group label; same value as the site label selected in the expression.
+ alertgroup: "dublin"
+ annotations:
# Templated annotation: the expression's value passed through `humanize`,
# followed by the literal suffix "kVA".
+ current: "{{ $value | humanize }}kVA"
- alert: site temperature
expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
for: 6m
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
# Alerting rule "raid controller battery recharging" (lines added by this change).
# Matches every ohai_controller_info series whose battery_status label equals
# "recharging" and whose sample value is above 0.
+ - alert: raid controller battery recharging
+ expr: ohai_controller_info{battery_status="recharging"} > 0
# Prometheus `for` clause: the series must keep matching for 4 hours before the
# alert fires.
# NOTE(review): presumably this long window is meant to ignore routine recharge
# cycles - confirm the intended duration with the rule's author.
+ for: 4h
+ labels:
# Templated label: each alert's group is the `instance` label of the series
# that triggered it.
+ alertgroup: "{{ $labels.instance }}"
- alert: raid array degraded
expr: ohai_array_info{status="degraded"} > 0
for: 5m