X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/210ef98310a7e983a28e6136c8835834fe531ddf..44b52281d387a2a26df9cd9ac5e228c0976089fd:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=sidebyside

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 32e536736..23b94727c 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -407,7 +407,7 @@ groups:
         annotations:
           connections_used: "{{ $value | humanizePercentage }}"
       - alert: postgresql deadlocks
-        expr: increase(pg_stat_database_deadlocks[1m]) > 5
+        expr: increase(pg_stat_database_deadlocks{datname!="nominatim"}[1m]) > 5
         for: 0m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -444,6 +444,22 @@ groups:
         for: 5m
         labels:
           alertgroup: "{{ $labels.instance }}"
+  - name: rasdaemon
+    rules:
+      - alert: memory controller errors
+        expr: increase(rasdaemon_mc_events_total[1m]) > 0
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          new_errors: "{{ $value }}"
+      - alert: pcie aer errors
+        expr: increase(rasdaemon_aer_events_total[1m]) > 0
+        for: 0m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          new_errors: "{{ $value }}"
   - name: smart
     rules:
       - alert: smart failure
@@ -452,7 +468,7 @@ groups:
         labels:
           alertgroup: "{{ $labels.instance }}"
       - alert: smart ssd wearout approaching
-        expr: smart_percentage_used >= 90
+        expr: smart_percentage_used >= 80
         for: 60m
         labels:
           alertgroup: "{{ $labels.instance }}"