X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/f9c7ef7c00ab594a67af5a69acd906ffb51c77b3..11db8a461a61c0aba4259c9a3564d0e72bacc412:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=sidebyside diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index b0094a83b..8f2986e6a 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -17,6 +17,13 @@ groups: alertgroup: "amsterdam" annotations: current: "{{ $value | humanize }}A" + - alert: site power + expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3 + for: 0m + labels: + alertgroup: "amsterdam" + annotations: + current: "{{ $value | humanize }}kVA" - alert: site temperature expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26 for: 6m @@ -90,8 +97,8 @@ groups: - name: cpu rules: - alert: cpu pressure - expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6 - for: 15m + expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75 + for: 60m labels: alertgroup: "{{ $labels.instance }}" annotations: @@ -130,6 +137,13 @@ groups: alertgroup: "dublin" annotations: current: "{{ $value | humanize }}A" + - alert: site power + expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4 + for: 0m + labels: + alertgroup: "dublin" + annotations: + current: "{{ $value | humanize }}kVA" - alert: site temperature expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26 for: 6m @@ -578,7 +592,7 @@ groups: labels: alertgroup: "{{ $labels.instance }}" - alert: smart ssd wearout approaching - expr: smart_percentage_used >= 80 + expr: smart_percentage_used / 100 >= 0.8 for: 60m labels: alertgroup: "{{ $labels.instance }}" @@ -587,8 +601,8 @@ groups: - name: smokeping rules: - alert: packet loss - expr: 100 - (rate(smokeping_response_duration_seconds_count[5m]) * 100 / rate(smokeping_requests_total[5m])) > 0 - for: 5m + expr: 1 - (rate(smokeping_response_duration_seconds_count[5m]) / rate(smokeping_requests_total[5m])) > 0.02 + for: 10m labels: alertgroup: smokeping annotations: