From: Tom Hughes
Date: Sun, 28 Feb 2021 19:38:13 +0000 (+0000)
Subject: Add some additional prometheus alerts
X-Git-Url: https://git.openstreetmap.org./chef.git/commitdiff_plain/8087f4551b3f216a0ea1af0d55dccfbdc43b46dc?ds=inline;hp=--cc

Add some additional prometheus alerts
---

8087f4551b3f216a0ea1af0d55dccfbdc43b46dc
diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 40d49640a..0e834474c 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -164,6 +164,71 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           entries_used: "{{ $value | humanizePercentage }}"
+  # PostgreSQL alerts; each alert is grouped by the reporting instance.
+  - name: postgresql
+    rules:
+      - alert: postgresql down
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: postgresql replication delay
+        expr: pg_replication_lag_seconds > 5
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          delay: "{{ $value | humanizeDuration }}"
+      # Ratio of current activity count to max_connections above 0.8 (80%).
+      - alert: postgresql connection limit
+        expr: sum (pg_stat_activity_count) by (instance, server) / sum (pg_settings_max_connections) by (instance, server) > 0.8
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          connections_used: "{{ $value | humanizePercentage }}"
+      # Deadlock counter grew by more than 5 over the 1m window; no hold time.
+      - alert: postgresql deadlocks
+        expr: increase(pg_stat_database_deadlocks[1m]) > 5
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          new_deadlocks: "{{ $value }}"
+      - alert: postgresql slow queries
+        expr: pg_slow_queries > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          queries: "{{ $value }}"
+  # TLS certificate alerts; all of them share the fixed "ssl" alert group.
+  - name: ssl
+    rules:
+      - alert: ssl certificate probe failed
+        expr: ssl_probe_success == 0
+        for: 60m
+        labels:
+          alertgroup: ssl
+      # Under 14 days (86400 seconds per day) remain before not_after.
+      - alert: ssl certificate expiry
+        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 14
+        for: 0m
+        labels:
+          alertgroup: ssl
+        annotations:
+          expires_in: "{{ $value | humanizeDuration }}"
+      # ssl_ocsp_response_status values: 0 = good, 1 = revoked, 2 = unknown.
+      - alert: ssl certificate revoked
+        expr: ssl_ocsp_response_status == 1
+        for: 0m
+        labels:
+          alertgroup: ssl
+      - alert: ocsp status unknown
+        expr: ssl_ocsp_response_status == 2
+        for: 0m
+        labels:
+          alertgroup: ssl
   - name: tile
     rules:
       - alert: renderd replication delay