labels:
alertgroup: database
annotations:
- delay: "{{ $value }}"
+ queries: "{{ $value }}"
- name: discourse
rules:
- alert: discourse job failure rate
alertgroup: "{{ $labels.site }}"
annotations:
power: "{{ $value }} dBm"
+ - name: load  # rule group holding the load-average alert
+ rules:
+ - alert: load average  # per instance: node_load5 divided by the number of node_cpu_frequency_max_hertz series (presumably one per CPU - confirm), above 2
+ expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2
+ for: 5m  # condition must hold continuously for 5 minutes before the alert fires
+ labels:
+ alertgroup: "{{ $labels.instance }}"  # alertgroup label is set to the instance label of the firing series
+ annotations:
+ load: "{{ $value | humanizePercentage }}"  # the ratio is rendered as a percentage, so a value of 2 reads as 200%
- name: mail
rules:
- alert: exim down
alertgroup: nominatim
annotations:
delay: "{{ $value | humanizeDuration }}"
+ - alert: nominatim connections  # total nginx_connections_writing across instances that also have chef_role{name="nominatim"}, above 2500
+ expr: sum(nginx_connections_writing and on (instance) chef_role{name="nominatim"}) > 2500
+ for: 15m  # condition must hold continuously for 15 minutes before the alert fires
+ labels:
+ alertgroup: nominatim  # fixed group name; sum() has no by clause, so the result carries no instance label
- name: overpass
rules:
- alert: overpass osm database age
alertgroup: "{{ $labels.instance }}"
annotations:
new_errors: "{{ $value }}"
+ - name: resolved  # rule group holding the DNSSEC validation alert
+ rules:
+ - alert: dnssec validation failures  # per-second rate of resolved_dnssec_verdicts_total with result="bogus", above 1
+ expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1
+ for: 5m  # NOTE(review): rate() needs at least two samples inside the 1m window - confirm the scrape interval is well under 1m
+ labels:
+ alertgroup: "{{ $labels.instance }}"  # alertgroup label is set to the instance label of the firing series
- name: smart
rules:
- alert: smart failure
- name: web
rules:
- alert: web error rate  # per instance: share of API calls with status 500-508 or 510-599 (509 is not matched) above 0.002, and, in the updated expr, an absolute rate of those errors above 0.01/s
- expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
+ expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.01
for: 5m  # condition must hold for 5 minutes; the added absolute floor presumably stops low-traffic instances tripping the ratio alone - confirm
labels:
alertgroup: web  # fixed group name for this alert