git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Base site power alerts on a one hour rolling average
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index cb8d91e99444379eb24b51336e86b4f24c9373ea..8cf1776694f3f15573e49e32e4bbbe2b30e575f6 100644 (file)
@@ -4,19 +4,19 @@ groups:
   - name: amsterdam
     rules:
       - alert: pdu current draw
-        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10
+        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28
         for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
-      - alert: site current draw
-        expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13
+      - alert: site power
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3
         for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
-          current: "{{ $value | humanize }}A"
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
@@ -90,8 +90,8 @@ groups:
   - name: cpu
     rules:
       - alert: cpu pressure
-        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
-        for: 15m
+        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
+        for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
@@ -117,19 +117,19 @@ groups:
   - name: dublin
     rules:
       - alert: pdu current draw
-        expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 13
+        expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28
         for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
-      - alert: site current draw
-        expr: sum(rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10) > 17
+      - alert: site power
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4
         for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
-          current: "{{ $value | humanize }}A"
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
@@ -544,6 +544,11 @@ groups:
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: raid controller battery recharging
+        expr: ohai_controller_info{battery_status="recharging"} > 0
+        for: 4h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
       - alert: raid array degraded
         expr: ohai_array_info{status="degraded"} > 0
         for: 5m