for: 15m
labels:
alertgroup: fastly
- - alert: fastly healthcheck failing
+ - alert: multiple fastly healthchecks failing
expr: count(fastly_healthcheck_status == 0) > 4
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
error_rate: "{{ $value | humanizePercentage }}"
- - alert: interface transmit errors
+ - alert: wireguard interface transmit errors
expr: rate(node_network_transmit_errs_total{device=~"wg.*"}[1m]) / rate(node_network_transmit_packets_total{device=~"wg.*"}[1m]) > 0.05
for: 1h
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
entries_used: "{{ $value | humanizePercentage }}"
+ - name: nominatim
+ rules:
+ - alert: nominatim replication delay
+ expr: nominatim_replication_delay > 10800
+ for: 1h
+ labels:
+ alertgroup: nominatim
+ annotations:
+ delay: "{{ $value | humanizeDuration }}"
+ - name: overpass
+ rules:
+ - alert: overpass osm database age
+ expr: overpass_database_age_seconds{database="osm"} > 300
+ for: 5m
+ labels:
+ alertgroup: overpass
+ annotations:
+ age: "{{ $value | humanizeDuration }}"
+ - alert: overpass area database age
+ expr: overpass_database_age_seconds{database="area"} > 86400
+ for: 1h
+ labels:
+ alertgroup: overpass
+ annotations:
+ age: "{{ $value | humanizeDuration }}"
- name: planet
rules:
- alert: planet dump overdue
- name: rasdaemon
rules:
- alert: memory controller errors
- expr: increase(rasdaemon_mc_events_total) > 0
+ expr: increase(rasdaemon_mc_events_total[1m]) > 0
for: 0m
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
new_errors: "{{ $value }}"
- alert: pcie aer errors
- expr: increase(rasdaemon_aer_events_total) > 0
+ expr: increase(rasdaemon_aer_events_total[1m]) > 0
for: 0m
labels:
alertgroup: "{{ $labels.instance }}"
annotations:
- new_errors: "{{ $value }}"
+ new_errors: "{{ $value }}"
- name: smart
rules:
- alert: smart failure
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
- - alert: systemd failed service
+ - alert: systemd failed chef client service
expr: node_systemd_unit_state{state="failed",name="chef-client.service"} == 1
for: 6h
labels: