X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/b602bd1e89b1d4de6f776db8a23a3ae778b2edbe..88f820a038a839a687db7374235eca3003790296:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=sidebyside diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 9a3c52920..396de8de4 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -3,23 +3,23 @@ groups: - name: amsterdam rules: - - alert: pdu current draw - expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 10 + - alert: uplink + expr: ifOperStatus{site="amsterdam",ifName=~"ge-[01]/2/2"} != 1 for: 6m labels: alertgroup: "amsterdam" annotations: - current: "{{ $value | humanize }}A" - - alert: site current draw - expr: sum(rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10) > 13 + status: "{{ $value }}" + - alert: pdu current draw + expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m labels: alertgroup: "amsterdam" annotations: current: "{{ $value | humanize }}A" - alert: site power - expr: sum(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 100) > 3 - for: 0m + expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3.5 + for: 6m labels: alertgroup: "amsterdam" annotations: @@ -52,13 +52,6 @@ groups: alertgroup: "{{ $labels.instance }}" annotations: busy_workers: "{{ $value | humanizePercentage }}" - - alert: apache low request rate - expr: rate(apache_accesses_total[5m]) / rate(apache_accesses_total[1h] offset 1w) < 0.25 and rate(apache_accesses_total[1h] offset 1w) > 2 - for: 15m - labels: - alertgroup: "{{ $labels.instance }}" - annotations: - request_rate: "{{ $value | humanizePercentage }}" - name: chef rules: - alert: chef client not running @@ -123,23 +116,23 @@ groups: failure_rate: "{{ $value }} jobs/s" - name: dublin rules: - - alert: pdu current draw - expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 13 + - alert: uplink + expr: ifOperStatus{site="dublin",ifName=~"ge-[01]/2/2"} != 1 for: 6m labels: alertgroup: "dublin" annotations: - current: "{{ $value | humanize }}A" - - alert: site current draw - expr: sum(rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10) > 17 + status: "{{ $value }}" + - alert: pdu current draw + expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28 for: 6m labels: alertgroup: "dublin" annotations: current: "{{ $value | humanize }}A" - alert: site power - expr: sum(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"} / 100) > 4 - for: 0m + expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="dublin",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 4 + for: 6m labels: alertgroup: "dublin" annotations: @@ -270,10 +263,12 @@ groups: - name: juniper rules: - alert: juniper cpu alarm - expr: jnxOperatingCPU{jnxOperatingContentsIndex="7"} > 30 + expr: jnxOperating5MinLoadAvg{jnxOperatingContentsIndex="9"} / 200 > 0.5 for: 5m labels: alertgroup: "{{ $labels.site }}" + annotations: + load_average: "{{ $value | humanizePercentage }}" - alert: juniper fan alarm expr: jnxOperatingState{jnxOperatingContentsIndex="4",jnxOperatingState!~"running.*"} > 0 for: 5m @@ -292,7 +287,7 @@ groups: labels: alertgroup: "{{ $labels.instance }}" - alert: exim queue length - expr: exim_queue > exim_queue_limit + expr: exim_queue > ignoring(job) exim_queue_limit for: 60m labels: alertgroup: mail @@ -377,14 +372,14 @@ groups: - name: network rules: - alert: interface transmit rate - expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" annotations: bandwidth_used: "{{ $value | humanizePercentage }}" - alert: interface receive rate - expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" @@ -539,6 +534,13 @@ groups: alertgroup: "{{ $labels.instance }}" annotations: queries: "{{ $value }}" + - alert: postgresql idle transactions + expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server) + for: 5m + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + queries: "{{ $value }}" - name: prometheus rules: - alert: prometheus configuration error @@ -664,6 +666,29 @@ groups: for: 0m labels: alertgroup: "{{ $labels.instance }}" + - name: taginfo + rules: + - alert: taginfo planet age + expr: time() - taginfo_data_from_seconds > 129600 + for: 0m + labels: + alertgroup: taginfo + annotations: + age: "{{ $value | humanizeDuration }}" + - alert: taginfo database age + expr: time() - taginfo_database_update_finish_seconds > 129600 + for: 0m + labels: + alertgroup: taginfo + annotations: + age: "{{ $value | humanizeDuration }}" + - alert: taginfo database size + expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1 + for: 30m + labels: + alertgroup: taginfo + annotations: + size_change: "{{ $value | humanizePercentage }}" - name: tile rules: - alert: renderd replication delay @@ -711,8 +736,8 @@ groups: annotations: error_rate: "{{ $value | humanizePercentage }}" - alert: job processing rate - expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 - for: 15m + expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[1h]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[1h]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 + for: 1h labels: alertgroup: web annotations: