git.openstreetmap.org Git - chef.git/blobdiff - cookbooks/prometheus/templates/default/alert_rules.yml.erb
Reduce sensitivity of job processing rate alert
[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
index 2a7fd1b097d8a557c3d6a8fa0d26424aeb69f6ed..02c41ce52555b862ed94c33b0e377ecfd3a3497c 100644 (file)
@@ -363,14 +363,14 @@ groups:
   - name: network
     rules:
       - alert: interface transmit rate
-        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           bandwidth_used: "{{ $value | humanizePercentage }}"
       - alert: interface receive rate
-        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.99
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -526,7 +526,7 @@ groups:
         annotations:
           queries: "{{ $value }}"
       - alert: postgresql idle transactions
-        expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="120"}) by (instance, server)
+        expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -675,7 +675,7 @@ groups:
           age: "{{ $value | humanizeDuration }}"
       - alert: taginfo database size
         expr: abs(delta(taginfo_database_size_bytes[30m])) / taginfo_database_size_bytes > 0.1
-        for: 0m
+        for: 30m
         labels:
           alertgroup: taginfo
         annotations:
@@ -727,8 +727,8 @@ groups:
         annotations:
           error_rate: "{{ $value | humanizePercentage }}"
       - alert: job processing rate
-        expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
-        for: 15m
+        expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[1h]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[1h]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
+        for: 1h
         labels:
           alertgroup: web
         annotations: