X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/c954aeff2861e3c9fcd8e236f08a2b15f4a00202..7d85513d56261a3f6e6b13384137974e852c565e:/cookbooks/prometheus/templates/default/alert_rules.yml.erb

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index cb8d91e99..4e00763e6 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -4,19 +4,19 @@ groups:
   - name: amsterdam
     rules:
       - alert: pdu current draw
-        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10
+        expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28
         for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
           current: "{{ $value | humanize }}A"
-      - alert: site current draw
-        expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13
+      - alert: site power
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3
         for: 6m
         labels:
           alertgroup: "amsterdam"
         annotations:
-          current: "{{ $value | humanize }}A"
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
@@ -90,8 +90,8 @@ groups:
   - name: cpu
     rules:
       - alert: cpu pressure
-        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.6
-        for: 15m
+        expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.75
+        for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
@@ -117,19 +117,19 @@ groups:
   - name: dublin
     rules:
       - alert: pdu current draw
-        expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 13
+        expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28
         for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
           current: "{{ $value | humanize }}A"
-      - alert: site current draw
-        expr: sum(rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10) > 17
+      - alert: site power
+        expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4
         for: 6m
         labels:
           alertgroup: "dublin"
         annotations:
-          current: "{{ $value | humanize }}A"
+          current: "{{ $value | humanize }}kVA"
       - alert: site temperature
         expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
@@ -363,14 +363,14 @@ groups:
   - name: network
     rules:
       - alert: interface transmit rate
-        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           bandwidth_used: "{{ $value | humanizePercentage }}"
       - alert: interface receive rate
-        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -525,6 +525,13 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           queries: "{{ $value }}"
+      - alert: postgresql idle transactions
+        expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          queries: "{{ $value }}"
   - name: prometheus
     rules:
       - alert: prometheus configuration error
@@ -544,6 +551,11 @@ groups:
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+      - alert: raid controller battery recharging
+        expr: ohai_controller_info{battery_status="recharging"} > 0
+        for: 4h
+        labels:
+          alertgroup: "{{ $labels.instance }}"
       - alert: raid array degraded
         expr: ohai_array_info{status="degraded"} > 0
         for: 5m
@@ -645,6 +657,29 @@ groups:
         for: 0m
         labels:
           alertgroup: "{{ $labels.instance }}"
+  - name: taginfo
+    rules:
+      - alert: taginfo planet age
+        expr: time() - taginfo_data_from_seconds > 129600
+        for: 0m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          age: "{{ $value | humanizeDuration }}"
+      - alert: taginfo database age
+        expr: time() - taginfo_database_update_finish_seconds > 129600
+        for: 0m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          age: "{{ $value | humanizeDuration }}"
+      - alert: taginfo database size
+        expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1
+        for: 30m
+        labels:
+          alertgroup: taginfo
+        annotations:
+          size_change: "{{ $value | humanizePercentage }}"
   - name: tile
     rules:
       - alert: renderd replication delay