X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/68e068818d559ef35bdf2a138a93596337828ef9..98c8d0026a2836abf350ac2a393930ff81cef41c:/cookbooks/prometheus/templates/default/alert_rules.yml.erb
NOTE(review): line breaks and YAML indentation below were reconstructed from a whitespace-collapsed copy of this patch; hunk line counts match the headers, but confirm the exact leading whitespace against the upstream blob before applying.

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 149ff86bc..367e07650 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -18,7 +18,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25.5
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
         for: 6m
         labels:
           alertgroup: "amsterdam"
@@ -122,7 +122,7 @@ groups:
         annotations:
           current: "{{ $value | humanize }}A"
       - alert: site temperature
-        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 25.5
+        expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
         for: 6m
         labels:
           alertgroup: "dublin"
@@ -157,7 +157,7 @@ groups:
   - name: filesystem
     rules:
       - alert: readonly filesystem
-        expr: node_filesystem_readonly == 1
+        expr: node_filesystem_readonly > min_over_time(node_filesystem_readonly[7d])
         for: 0m
         labels:
           alertgroup: "{{ $labels.instance }}"
@@ -337,6 +337,20 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           new_oom_kills: "{{ $value }}"
+  - name: mysql
+    rules:
+      - alert: mysql down
+        expr: mysql_up == 0
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: mysql connection limit
+        expr: mysql_global_status_max_used_connections / mysql_global_variables_max_connections > 0.8
+        for: 1m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          connections_used: "{{ $value | humanizePercentage }}"
   - name: network
     rules:
       - alert: interface transmit rate
@@ -541,7 +555,7 @@ groups:
         labels:
           alertgroup: "{{ $labels.instance }}"
         annotations:
-          new_ercrors: "{{ $value }}"
+          new_errors: "{{ $value }}"
   - name: smart
     rules:
       - alert: smart failure
@@ -556,6 +570,15 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           percentage_used: "{{ $value | humanizePercentage }}"
+  - name: snmp
+    rules:
+      - alert: snmp pdus missing
+        expr: max_over_time(snmp_scrape_pdus_returned[1d]) - snmp_scrape_pdus_returned > 0
+        for: 15m
+        labels:
+          alertgroup: snmp
+        annotations:
+          missing_pdus: "{{ $value }}"
   - name: ssl
     rules:
       - alert: ssl certificate probe failed
@@ -615,6 +638,13 @@ groups:
           alertgroup: tile
         annotations:
           miss_rate: "{{ $value | humanizePercentage }}"
+      - alert: tile render rate
+        expr: sum(rate(renderd_zoom_metatiles_total[5m])) by (instance) == 0
+        for: 15m
+        labels:
+          alertgroup: tile
+        annotations:
+          render_rate: "{{ $value }} tiles/s"
   - name: time
     rules:
       - alert: clock not synchronising