X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/694ab226ea187543b7987cc396853f2c2515df7c..eb10ee11cf967f641485a4124c337f52ce6b9939:/cookbooks/prometheus/templates/default/alert_rules.yml.erb?ds=sidebyside

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 16496c12d..3618f1461 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -18,7 +18,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25.5
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
         labels:
           alertgroup: "amsterdam"
@@ -122,7 +122,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 25.5
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
         labels:
           alertgroup: "dublin"
@@ -337,6 +337,20 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           new_oom_kills: "{{ $value }}"
+  - name: mysql
+    rules:
+      - alert: mysql down
+        expr: mysql_up == 0
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: mysql connection limit
+        expr: mysql_global_status_max_used_connections / mysql_global_variables_max_connections > 0.8
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          connections_used: "{{ $value | humanizePercentage }}"
   - name: network
     rules:
       - alert: interface transmit rate
@@ -541,7 +555,7 @@ groups:
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
-          new_ercrors: "{{ $value }}"
+          new_errors: "{{ $value }}"
   - name: smart
     rules:
       - alert: smart failure
@@ -556,6 +570,15 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           percentage_used: "{{ $value | humanizePercentage }}"
+  - name: snmp
+    rules:
+      - alert: snmp pdus missing
+        expr: max_over_time(snmp_scrape_pdus_returned[1d]) - snmp_scrape_pdus_returned > 0
+        for: 15m
+        labels:
+          alertgroup: snmp
+        annotations:
+          missing_pdus: "{{ $value }}"
   - name: ssl
     rules:
       - alert: ssl certificate probe failed
@@ -615,6 +638,13 @@ groups:
           alertgroup: tile
         annotations:
           miss_rate: "{{ $value | humanizePercentage }}"
+      - alert: tile render rate
+        expr: sum(rate(renderd_zoom_metatiles_total[5m])) by (instance) < 1
+        for: 5m
+        labels:
+          alertgroup: tile
+        annotations:
+          render_rate: "{{ $value }} tiles/s"
   - name: time
     rules:
       - alert: clock not synchronising