X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/2c385f3d70fb6867de8c6f0d846eba62fba1aeb0..ae3dbed359fa5aa4fde098f72fba1c6ca1cd60a9:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=inline

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index d870444ca..40d49640a 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -8,6 +8,31 @@ groups:
         for: 5m
         labels:
           alertgroup: "prometheus"
+  # Apache availability and worker saturation.
+  - name: apache
+    rules:
+      - alert: apache down
+        expr: apache_up == 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: apache workers busy
+        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          busy_workers: "{{ $value | humanizePercentage }}"
+  # PostgreSQL replication lag.
+  - name: database
+    rules:
+      - alert: postgres replication delay
+        expr: pg_replication_lag_seconds > 5
+        for: 5m
+        labels:
+          alertgroup: database
+        annotations:
+          delay: "{{ $value | humanizeDuration }}"
   - name: hwmon
     rules:
       - alert: hwmon fan alarm
@@ -104,3 +129,77 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           new_oom_kills: "{{ $value }}"
+  # Network interface throughput, error ratios and conntrack table usage.
+  - name: network
+    rules:
+      - alert: interface transmit rate
+        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          bandwidth_used: "{{ $value | humanizePercentage }}"
+      - alert: interface receive rate
+        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          bandwidth_used: "{{ $value | humanizePercentage }}"
+      - alert: interface transmit errors
+        expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          error_rate: "{{ $value | humanizePercentage }}"
+      - alert: interface receive errors
+        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          error_rate: "{{ $value | humanizePercentage }}"
+      - alert: conntrack entries
+        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          entries_used: "{{ $value | humanizePercentage }}"
+  # Tile serving: renderd replication delay and share of 404 responses.
+  - name: tile
+    rules:
+      - alert: renderd replication delay
+        expr: renderd_replication_delay > 120
+        for: 5m
+        labels:
+          alertgroup: tile
+        annotations:
+          delay: "{{ $value | humanizeDuration }}"
+      - alert: missed tile rate
+        expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
+        for: 5m
+        labels:
+          alertgroup: tile
+        annotations:
+          miss_rate: "{{ $value | humanizePercentage }}"
+  # Clock synchronisation status and offset.
+  - name: time
+    rules:
+      - alert: clock not synchronising
+        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      # Matches only while the offset is beyond +/-0.05 and its 5m derivative
+      # has the same sign (or is zero), i.e. the offset is not shrinking.
+      - alert: clock skew detected
+        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          # query yields a list of samples, so take the first sample's value
+          # before formatting; humanizeDuration cannot format the list itself.
+          skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | first | value | humanizeDuration }}{{ end }}"