git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Merge remote-tracking branch 'github/pull/550'
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index 3dd1af873d8fdc8ef227327c5e7b64fc61e00506..16496c12d0887aec40f6ead0702b45ea8e962834 100644 (file)
@@ -5,28 +5,28 @@ groups:
     rules:
       - alert: pdu current draw
         expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site current draw
         expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25
-        for: 5m
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25.5
+        for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
           temperature: "{{ $value | humanize }}C"
       - alert: site humidity
         expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 < 0.25 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 > 0.65
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
@@ -109,28 +109,28 @@ groups:
     rules:
       - alert: pdu current draw
         expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 13
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site current draw
         expr: sum(rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10) > 17
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 25
-        for: 5m
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 25.5
+        for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
           temperature: "{{ $value | humanize }}C"
       - alert: site humidity
         expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="dublin"}) / 100 < 0.25 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="dublin"}) / 100 > 0.65
-        for: 5m
+        for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
@@ -157,7 +157,7 @@ groups:
   - name: filesystem
     rules:
       - alert: readonly filesystem
-        expr: node_filesystem_readonly == 1
+        expr: node_filesystem_readonly > min_over_time(node_filesystem_readonly[7d])
         for: 0m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -263,6 +263,11 @@ groups:
           alertgroup: "{{ $labels.site }}"
   - name: mail
     rules:
+      - alert: exim down
+        expr: exim_up == 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
       - alert: exim queue length
         expr: exim_queue > exim_queue_limit
         for: 60m