X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/e85f7cbf201686dac598d8b285585d9af468490d..44b52281d387a2a26df9cd9ac5e228c0976089fd:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=inline diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 30a14a471..23b94727c 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -115,7 +115,12 @@ groups: annotations: error_rate: "{{ $value | humanizePercentage }}" - alert: fastly healthcheck failing - expr: fastly_healthcheck_status == 0 + expr: count(fastly_healthcheck_status == 0) > 0 + for: 15m + labels: + alertgroup: fastly + - alert: fastly healthcheck failing + expr: count(fastly_healthcheck_status == 0) > 4 for: 5m labels: alertgroup: fastly @@ -402,7 +407,7 @@ groups: annotations: connections_used: "{{ $value | humanizePercentage }}" - alert: postgresql deadlocks - expr: increase(pg_stat_database_deadlocks[1m]) > 5 + expr: increase(pg_stat_database_deadlocks{datname!="nominatim"}[1m]) > 5 for: 0m labels: alertgroup: "{{ $labels.instance }}" @@ -439,6 +444,22 @@ groups: for: 5m labels: alertgroup: "{{ $labels.instance }}" + - name: rasdaemon + rules: + - alert: memory controller errors + expr: increase(rasdaemon_mc_events_total[1m]) > 0 + for: 0m + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + new_errors: "{{ $value }}" + - alert: pcie aer errors + expr: increase(rasdaemon_aer_events_total[1m]) > 0 + for: 0m + labels: + alertgroup: "{{ $labels.instance }}" + annotations: + new_errors: "{{ $value }}" - name: smart rules: - alert: smart failure @@ -447,7 +468,7 @@ groups: labels: alertgroup: "{{ $labels.instance }}" - alert: smart ssd wearout approaching - expr: smart_percentage_used >= 90 + expr: smart_percentage_used >= 80 for: 60m labels: alertgroup: "{{ $labels.instance }}"