X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/326c0527e3a2616f9f808d14d533f7c761c73616..d1633aaa414c1d6126ed0d1a60ee33537ccd7c21:/cookbooks/prometheus/templates/default/alert_rules.yml.erb

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index d29b7272d..4511e1eac 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -8,6 +8,20 @@ groups:
         for: 5m
         labels:
           alertgroup: "prometheus"
+  - name: apache  # exporter reachability and worker saturation
+    rules:
+      - alert: apache down
+        expr: apache_up == 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: apache workers busy
+        expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8  # busy workers as a fraction of all scoreboard slots
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          busy_workers: "{{ $value | humanizePercentage }}"
   - name: database
     rules:
       - alert: postgres replication delay
@@ -68,6 +82,11 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           voltage_volts: "{{ with printf \"ipmi_voltage_volts{instance='%s',id='%s'}\" $labels.instance $labels.id | query }}{{ . | first | value | humanize }}V{{end}}"
+      - alert: ipmi power alarm
+        expr: ipmi_power_state > 0 or ipmi_sensor_state{type=~"Power .*"} > 0  # any non-zero power state, or any "Power ..." sensor in a non-zero state
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: mdadm
     rules:
       - alert: mdadm array inactive
@@ -113,6 +132,116 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           new_oom_kills: "{{ $value }}"
+  - name: network  # link utilisation, packet error ratios and conntrack table usage
+    rules:
+      - alert: interface transmit rate
+        expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98  # fraction of reported link speed
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          bandwidth_used: "{{ $value | humanizePercentage }}"
+      - alert: interface receive rate
+        expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98  # fraction of reported link speed
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          bandwidth_used: "{{ $value | humanizePercentage }}"
+      - alert: interface transmit errors
+        expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01  # errors per transmitted packet
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          error_rate: "{{ $value | humanizePercentage }}"
+      - alert: interface receive errors
+        expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01  # errors per received packet
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          error_rate: "{{ $value | humanizePercentage }}"
+      - alert: conntrack entries
+        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8  # fraction of the conntrack table in use
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          entries_used: "{{ $value | humanizePercentage }}"
+  - name: postgresql  # availability, replication lag, connection usage, deadlocks and slow queries
+    rules:
+      - alert: postgresql down
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: postgresql replication delay
+        expr: pg_replication_lag_seconds > 5
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          delay: "{{ $value | humanizeDuration }}"
+      - alert: postgresql connection limit
+        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8  # connections in use as a fraction of max_connections
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          connections_used: "{{ $value | humanizePercentage }}"
+      - alert: postgresql deadlocks
+        expr: increase(pg_stat_database_deadlocks[1m]) > 5
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          new_deadlocks: "{{ $value }}"
+      - alert: postgresql slow queries
+        expr: pg_slow_queries > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          queries: "{{ $value }}"
+  - name: smart  # disk health status and SSD wear
+    rules:
+      - alert: smart failure
+        expr: smart_health_status == 0
+        for: 60m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: smart ssd wearout approaching
+        expr: smart_percentage_used >= 90  # threshold treats the metric as a 0-100 percentage
+        for: 60m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          percentage_used: "{{ $value | humanize }}%"  # not humanizePercentage: that multiplies by 100 and would render 90 as 9000%
+  - name: ssl  # certificate probe failures, upcoming expiry and OCSP status
+    rules:
+      - alert: ssl certificate probe failed
+        expr: ssl_probe_success == 0
+        for: 60m
+        labels:
+          alertgroup: "ssl"
+      - alert: ssl certificate expiry
+        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14  # less than 14 days (in seconds) until expiry
+        for: 0m
+        labels:
+          alertgroup: "ssl"
+        annotations:
+          expires_in: "{{ $value | humanizeDuration }}"
+      - alert: ssl certificate revoked
+        expr: ssl_ocsp_response_status == 1  # ssl_exporter OCSP status: 0 = good, 1 = revoked, 2 = unknown
+        for: 0m
+        labels:
+          alertgroup: "ssl"
+      - alert: ocsp status unknown
+        expr: ssl_ocsp_response_status == 2  # 2 = unknown; was == 1, which duplicated the revoked alert
+        for: 0m
+        labels:
+          alertgroup: "ssl"
   - name: tile
     rules:
       - alert: renderd replication delay