X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/25d07fd9899c0ba0e741a21eb9a590ea6073e163..6ca4c5b1adbeff9556a2f63e7716d21d02495901:/cookbooks/prometheus/templates/default/alert_rules.yml.erb diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index a0cea5792..305afbd90 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -3,13 +3,20 @@ groups: - name: amsterdam rules: - - alert: uplink + - alert: he uplink expr: junos_interface_up{site="amsterdam",name=~"ge-[01]/2/2"} != 1 for: 6m labels: alertgroup: "amsterdam" annotations: status: "{{ $value }}" + - alert: equinix uplink + expr: junos_interface_up{site="amsterdam",name=~"xe-[01]/2/0"} != 1 + for: 6m + labels: + alertgroup: "amsterdam" + annotations: + status: "{{ $value }}" - alert: pdu current draw expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m @@ -118,7 +125,7 @@ groups: labels: alertgroup: database annotations: - delay: "{{ $value }}" + queries: "{{ $value }}" - name: discourse rules: - alert: discourse job failure rate @@ -130,13 +137,20 @@ groups: failure_rate: "{{ $value }} jobs/s" - name: dublin rules: - - alert: uplink + - alert: he uplink expr: junos_interface_up{site="dublin",name=~"ge-[01]/2/2"} != 1 for: 6m labels: alertgroup: "dublin" annotations: status: "{{ $value }}" + - alert: equinix uplink + expr: junos_interface_up{site="dublin",name=~"xe-[01]/2/0"} != 1 + for: 6m + labels: + alertgroup: "dublin" + annotations: + status: "{{ $value }}" - alert: pdu current draw expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m @@ -331,6 +345,15 @@ groups: alertgroup: "{{ $labels.site }}" annotations: power: "{{ $value }} dBm" + - name: load + rules: + - alert: load average + expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2 + for: 5m + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + load: "{{ $value | humanizePercentage }}" - name: mail rules: - alert: exim down @@ -488,6 +511,11 @@ groups: alertgroup: nominatim annotations: delay: "{{ $value | humanizeDuration }}" + - alert: nominatim connections + expr: sum(nginx_connections_writing and on (instance) chef_role{name="nominatim"}) > 2500 + for: 15m + labels: + alertgroup: nominatim - name: overpass rules: - alert: overpass osm database age @@ -655,6 +683,13 @@ groups: alertgroup: "{{ $labels.instance }}" annotations: new_errors: "{{ $value }}" + - name: resolved + rules: + - alert: dnssec validation failures + expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1 + for: 5m + labels: + alertgroup: "{{ $labels.instance }}" - name: smart rules: - alert: smart failure @@ -793,7 +828,7 @@ groups: - name: web rules: - alert: web error rate - expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 + expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.01 for: 5m labels: alertgroup: web