X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/16ae8dd1960a18f6c82973aab884a142a8901f54..3a33d35c42600aa215df8467702c98bee73415f8:/cookbooks/prometheus/templates/default/alert_rules.yml.erb diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 521d2c92b..32ec1344a 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -11,7 +11,7 @@ groups: annotations: current: "{{ $value | humanize }}A" - alert: site power - expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3 + expr: sum(avg_over_time(rPDU2PhaseStatusApparentPower{site="amsterdam",rPDU2PhaseStatusIndex="1"}[1h]) / 100) > 3.5 for: 6m labels: alertgroup: "amsterdam" @@ -363,14 +363,14 @@ groups: - name: network rules: - alert: interface transmit rate - expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" annotations: bandwidth_used: "{{ $value | humanizePercentage }}" - alert: interface receive rate - expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98 + expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99 for: 5m labels: alertgroup: "{{ $labels.instance }}" @@ -727,8 +727,8 @@ groups: annotations: error_rate: "{{ $value | humanizePercentage }}" - alert: job processing rate - expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 - for: 15m + expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[1h]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[1h]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1 + for: 1h labels: alertgroup: web annotations: