X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/c954aeff2861e3c9fcd8e236f08a2b15f4a00202..1e91c84f1d288714b490e894ca4036eae6fe380f:/cookbooks/prometheus/templates/default/alert_rules.yml.erb
diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index cb8d91e99..9990483b8 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -17,6 +17,13 @@ groups:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
+      - alert: site power
+        expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3
+        for: 6m
+        labels:
+          alertgroup: "amsterdam"
+        annotations:
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
@@ -90,8 +97,8 @@ groups:
   - name: cpu
     rules:
       - alert: cpu pressure
-        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
-        for: 15m
+        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
+        for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
@@ -130,6 +137,13 @@ groups:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
+      - alert: site power
+        expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4
+        for: 6m
+        labels:
+          alertgroup: "dublin"
+        annotations:
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
@@ -544,6 +558,11 @@ groups:
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: raid controller battery recharging
+        expr: ohai_controller_info{battery_status="recharging"} > 0
+        for: 4h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
       - alert: raid array degraded
         expr: ohai_array_info{status="degraded"} > 0
         for: 5m