# Rule group "cpu": alerts on the per-second rate of a node_pressure_* waiting counter.
- name: cpu
rules:
- alert: cpu pressure
# Diff pair below: the removed line read the memory pressure counter; the added
# line reads the CPU pressure counter, which matches the alert's name.
# Both compare the 5-minute rate against 0.3.
- expr: rate(node_pressure_memory_waiting_seconds_total[5m]) > 0.3
+ expr: rate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.3
for: 15m
labels:
# Groups the alert by the instance label of the matching series.
alertgroup: "{{ $labels.instance }}"
# NOTE(review): a second alertgroup key follows directly, and the delay annotation
# formats a duration rather than a pressure ratio. Presumably these lines come from
# a later hunk (another rule group) whose header is not visible here; confirm
# against the full file before treating this as a duplicate key.
alertgroup: database
annotations:
delay: "{{ $value | humanizeDuration }}"
# Rule group "fastly" (every line is added by this diff): one alert on the share
# of 5xx responses.
+ - name: fastly
+ rules:
+ - alert: error rate
# Per (service_name, datacenter): the 5-minute rate of fastly_rt_status_group_total
# with status_group="5xx", divided by the 5-minute rate of the same counter across
# all status groups. The rule matches when that ratio exceeds 0.005 (0.5%).
+ expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
# The condition has to hold for 15 minutes.
+ for: 15m
+ labels:
+ alertgroup: fastly
+ annotations:
# Publishes the computed ratio, formatted by humanizePercentage.
+ error_rate: "{{ $value | humanizePercentage }}"
# Rule group "filesystem".
- name: filesystem
rules:
- alert: readonly filesystem
# NOTE(review): no expr/for/labels lines for "readonly filesystem" are visible here,
# and the annotation below queries a hwmon voltage metric. Presumably it belongs to
# a different rule from a later hunk; confirm against the full file.
# The template runs a query for node_hwmon_in_max_volts filtered by this alert's
# instance, chip and sensor labels, then renders the first result's value through
# humanize with a "V" suffix.
in_max_volts: "{{ with printf \"node_hwmon_in_max_volts{instance='%s',chip='%s',sensor='%s'}\" $labels.instance $labels.chip $labels.sensor | query }}{{ . | first | value | humanize }}V{{end}}"
# Rule group "io": alerts on the per-second rate of the I/O pressure waiting counter.
- name: io
rules:
# Diff pair below: renames the alert from "cpu pressure" to "io pressure", which
# matches the node_pressure_io_* metric used in expr.
- - alert: cpu pressure
+ - alert: io pressure
# Matches when the 5-minute rate exceeds 0.6; the condition has to hold for 60 minutes.
expr: rate(node_pressure_io_waiting_seconds_total[5m]) > 0.6
for: 60m
labels:
# NOTE(review): a second for:/labels: pair follows with a different duration (5m).
# Presumably these lines belong to another rule from a later hunk whose alert/expr
# lines are not visible here; confirm against the full file.
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
# Rule group "mail" (every line is added by this diff): two queue-length alerts,
# both labelled alertgroup "mail" and both requiring the condition to hold for 60 minutes.
+ - name: mail
+ rules:
# Matches when the exim_queue series exceeds the exim_queue_limit series, so the
# threshold comes from a metric rather than a constant in this file.
+ - alert: exim queue length
+ expr: exim_queue > exim_queue_limit
+ for: 60m
+ labels:
+ alertgroup: mail
+ annotations:
# Publishes the raw value of the matching sample, unformatted.
+ queue_length: "{{ $value }}"
# Matches when mailman_queue_length exceeds the fixed threshold of 200.
+ - alert: mailman queue length
+ expr: mailman_queue_length > 200
+ for: 60m
+ labels:
+ alertgroup: mail
+ annotations:
# Publishes the raw value of the matching sample, unformatted.
+ queue_length: "{{ $value }}"
- name: mdadm
rules:
- alert: mdadm array inactive