From: Tom Hughes
Date: Tue, 4 Mar 2025 12:54:55 +0000 (+0000)
Subject: Reduce sensitivity of web error rate alert on idle servers
X-Git-Url: https://git.openstreetmap.org./chef.git/commitdiff_plain/d514d820f3d2d59fd9543a2d582ceb3dc8fcd432?hp=920b7eb79cf5798f70cf942744a88c3c12377dd1

Reduce sensitivity of web error rate alert on idle servers
---

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 305afbd90..27cb72e47 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -828,7 +828,7 @@ groups:
   - name: web
     rules:
       - alert: web error rate
-        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.01
+        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.05
         for: 5m
         labels:
           alertgroup: web