annotations:
current: "{{ $value | humanize }}A"
- alert: site temperature
- expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 25.5
+ expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
for: 6m
labels:
alertgroup: "amsterdam"
annotations:
current: "{{ $value | humanize }}A"
- alert: site temperature
- expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 25.5
+ expr: min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="dublin"}) / 10 > 26
for: 6m
labels:
alertgroup: "dublin"
- name: filesystem
rules:
- alert: readonly filesystem
- expr: node_filesystem_readonly == 1
+ expr: node_filesystem_readonly > min_over_time(node_filesystem_readonly[7d])
for: 0m
labels:
alertgroup: "{{ $labels.instance }}"
alertgroup: "{{ $labels.instance }}"
annotations:
new_oom_kills: "{{ $value }}"
+ - name: mysql
+ rules:
+ - alert: mysql down
+ expr: mysql_up == 0
+ for: 1m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - alert: mysql connection limit
+ expr: mysql_global_status_max_used_connections / mysql_global_variables_max_connections > 0.8
+ for: 1m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ connections_used: "{{ $value | humanizePercentage }}"
- name: network
rules:
- alert: interface transmit rate
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
- new_ercrors: "{{ $value }}"
+ new_errors: "{{ $value }}"
- name: smart
rules:
- alert: smart failure
alertgroup: "{{ $labels.instance }}"
annotations:
percentage_used: "{{ $value | humanizePercentage }}"
+ - name: snmp
+ rules:
+ - alert: snmp pdus missing
+ expr: max_over_time(snmp_scrape_pdus_returned[1d]) - snmp_scrape_pdus_returned > 0
+ for: 15m
+ labels:
+ alertgroup: snmp
+ annotations:
+ missing_pdus: "{{ $value }}"
- name: ssl
rules:
- alert: ssl certificate probe failed
alertgroup: tile
annotations:
miss_rate: "{{ $value | humanizePercentage }}"
+ - alert: tile render rate
+ expr: sum(rate(renderd_zoom_metatiles_total[5m])) by (instance) < 1
+ for: 5m
+ labels:
+ alertgroup: tile
+ annotations:
+ render_rate: "{{ $value }} tiles/s"
- name: time
rules:
- alert: clock not synchronising