alertgroup: database
annotations:
delay: "{{ $value | humanizeDuration }}"
# Rule group "fastly"; every line of this group is an addition in this diff.
+ - name: fastly
+ rules:
# Fires on the 5xx share of fastly_rt_status_group_total: the 5-minute rate of
# series with status_group="5xx" divided by the 5-minute rate of all series,
# each summed per (service_name, datacenter). Threshold is 0.005, i.e. 0.5%.
+ - alert: error rate
+ expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
# The expression has to keep matching for 15 minutes before the alert fires.
+ for: 15m
+ labels:
+ alertgroup: fastly
+ annotations:
# $value is the ratio computed by expr; humanizePercentage renders it as a percentage.
+ error_rate: "{{ $value | humanizePercentage }}"
- name: filesystem
rules:
- alert: readonly filesystem
alertgroup: "{{ $labels.instance }}"
# Rule group "mail": two queue-length rules, both labelled alertgroup: mail.
- name: mail
rules:
# This diff renames the existing rule from "mail queue length" to
# "exim queue length"; the lines that follow it are unchanged context.
- - alert: mail queue length
+ - alert: exim queue length
# Matches when the exim_queue metric exceeds the exim_queue_limit metric, so
# the threshold comes from a second series rather than a constant here.
expr: exim_queue > exim_queue_limit
# The expression has to keep matching for 60 minutes before the alert fires.
for: 60m
labels:
alertgroup: mail
annotations:
# Reports the raw value produced by expr, with no humanize filter applied.
queue_length: "{{ $value }}"
# Added rule: same shape as the exim rule, but for mailman_queue_length and
# with a fixed threshold of 200 instead of a limit metric.
+ - alert: mailman queue length
+ expr: mailman_queue_length > 200
+ for: 60m
+ labels:
+ alertgroup: mail
+ annotations:
+ queue_length: "{{ $value }}"
- name: mdadm
rules:
- alert: mdadm array inactive
alertgroup: web
annotations:
error_rate: "{{ $value | humanizePercentage }}"
# Added rule. Compares, over 5-minute windows, the rate of row deletions
# (pg_stat_user_tables_n_tup_del) with the rate of row insertions
# (pg_stat_user_tables_n_tup_ins) for relname="delayed_jobs" in
# datname="openstreetmap", and matches when deletions fall below 0.9 of insertions.
# NOTE(review): presumably a deleted row means a finished job, so a ratio
# under 0.9 would mean the queue is growing - confirm against the application.
# The trailing "and ignoring(...) chef_role{name="db-master"} == 1" keeps only
# results that have a matching chef_role series equal to 1, with the listed
# labels excluded from the match.
# NOTE(review): this looks intended to restrict the alert to the master database host - confirm.
+ - alert: job processing rate
+ expr: rate(pg_stat_user_tables_n_tup_del{datname="openstreetmap",relname="delayed_jobs"}[5m]) / rate(pg_stat_user_tables_n_tup_ins{datname="openstreetmap",relname="delayed_jobs"}[5m]) < 0.9 and ignoring(job, name, datname, relname, schemaname, server) chef_role{name="db-master"} == 1
# The expression has to keep matching for 5 minutes before the alert fires.
+ for: 5m
+ labels:
+ alertgroup: web
+ annotations:
# $value is the delete/insert ratio from expr, rendered as a percentage.
+ job_processing_rate: "{{ $value | humanizePercentage }}"