for: 15m
labels:
alertgroup: fastly
- - alert: fastly healthcheck failing
+ - alert: multiple fastly healthchecks failing
expr: count(fastly_healthcheck_status == 0) > 4
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
error_rate: "{{ $value | humanizePercentage }}"
- - alert: interface transmit errors
+ - alert: wireguard interface transmit errors
expr: rate(node_network_transmit_errs_total{device=~"wg.*"}[1m]) / rate(node_network_transmit_packets_total{device=~"wg.*"}[1m]) > 0.05
for: 1h
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
entries_used: "{{ $value | humanizePercentage }}"
+ - name: nominatim  # rule group holding the nominatim alerts
+ rules:
+ - alert: nominatim replication delay
+ expr: nominatim_replication_delay > 10800  # 10800 would be 3h if the metric is in seconds (the humanizeDuration annotation below suggests so) - TODO confirm unit
+ for: 1h  # the expression must stay true for a full hour before the alert fires
+ labels:
+ alertgroup: nominatim
+ annotations:
+ delay: "{{ $value | humanizeDuration }}"  # renders the sampled delay as a human-readable duration
- name: overpass
rules:
- alert: overpass osm database age
- expr: overpass_database_age_seconds{database="osm"} > 5m
+ expr: overpass_database_age_seconds{database="osm"} > 300  # threshold as a plain number of seconds (300 s = 5 min), replacing the former `5m` literal
for: 5m  # the age must stay above the threshold for 5 minutes before the alert fires
labels:
alertgroup: overpass
annotations:
age: "{{ $value | humanizeDuration }}"  # renders the sampled age (seconds) as a human-readable duration
- alert: overpass area database age
- expr: overpass_database_age_seconds{database="area"} > 24h
+ expr: overpass_database_age_seconds{database="area"} > 86400
for: 1h
labels:
alertgroup: overpass
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
- - alert: systemd failed service
+ - alert: systemd failed chef client service
expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
for: 6h
labels: