X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/98c9ef96146854a2f0fd8c342dae33fde8596155..a9eccad425a4f107b6449a670954179fd27b30c6:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=inline

# Summary of this patch (free-text preamble before the "diff --git" line):
#
#   1. Adds a rule group "load" with one alert, "load average". It fires
#      when, per instance, sum(node_load5) divided by the number of
#      node_cpu_frequency_max_hertz series is above 2 for 5 minutes.
#      (Presumably that series count is the CPU count, making the ratio
#      a per-CPU load -- confirm against the exporter.)
#      The alert is grouped by the instance label and annotates the
#      ratio formatted with humanizePercentage.
#
#   2. Tightens the "web error rate" alert. In addition to the existing
#      ratio check (matching-status rate / total rate > 0.002 per
#      instance), the per-instance rate of matching-status calls must
#      itself be above 0.01. The status regex is unchanged and matches
#      500-508 and 510-599.
#
# NOTE(review): the line breaks and YAML indentation below were
# reconstructed from the hunk headers' line counts and the YAML nesting;
# the whitespace was not preserved in the copy this was restored from.
# Verify against the repository before applying.

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index dfb83891d..dfd885bb8 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -331,6 +331,15 @@ groups:
           alertgroup: "{{ $labels.site }}"
         annotations:
           power: "{{ $value }} dBm"
+  - name: load
+    rules:
+      - alert: load average
+        expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          load: "{{ $value | humanizePercentage }}"
   - name: mail
     rules:
       - alert: exim down
@@ -798,7 +807,7 @@ groups:
   - name: web
     rules:
       - alert: web error rate
-        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
+        expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.01
         for: 5m
         labels:
           alertgroup: web