git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Increase alert window for site power usage alert
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index cb8d91e99444379eb24b51336e86b4f24c9373ea..9990483b855bfa1d4387a11b4830730a6e828c3f 100644 (file)
@@ -17,6 +17,13 @@ groups:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
+      - alert: site power
+        expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3
+        for: 6m
+        labels:
+          alertgroup: "amsterdam"
+        annotations:
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
@@ -90,8 +97,8 @@ groups:
   - name: cpu
     rules:
       - alert: cpu pressure
-        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
-        for: 15m
+        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
+        for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
@@ -130,6 +137,13 @@ groups:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
+      - alert: site power
+        expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4
+        for: 6m
+        labels:
+          alertgroup: "dublin"
+        annotations:
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
@@ -544,6 +558,11 @@ groups:
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: raid controller battery recharging
+        expr: ohai_controller_info{battery_status="recharging"} > 0
+        for: 4h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
       - alert: raid array degraded
         expr: ohai_array_info{status="degraded"} > 0
         for: 5m