annotations:
error_rate: "{{ $value | humanizePercentage }}"
- alert: fastly healthcheck failing
- expr: fastly_healthcheck_status == 0
+ expr: count(fastly_healthcheck_status == 0) > 0
+ for: 15m
+ labels:
+ alertgroup: fastly
+ - alert: fastly healthcheck failing
+ expr: count(fastly_healthcheck_status == 0) > 4
for: 5m
labels:
alertgroup: fastly
annotations:
connections_used: "{{ $value | humanizePercentage }}"
- alert: postgresql deadlocks
- expr: increase(pg_stat_database_deadlocks[1m]) > 5
+ expr: increase(pg_stat_database_deadlocks{datname!="nominatim"}[1m]) > 5
for: 0m
labels:
alertgroup: "{{ $labels.instance }}"
for: 5m
labels:
alertgroup: "{{ $labels.instance }}"
+ # Alerts driven by the rasdaemon_* event counters.
+ - name: rasdaemon
+ rules:
+ # Fires immediately (for: 0m) once the memory controller event counter
+ # has increased within the last minute.
+ - alert: memory controller errors
+ expr: increase(rasdaemon_mc_events_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ # $value is the increase computed by the expr above.
+ new_errors: "{{ $value }}"
+ # Same pattern, applied to the PCIe AER event counter.
+ - alert: pcie aer errors
+ expr: increase(rasdaemon_aer_events_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ # Uses the same annotation key as the memory controller alert.
+ new_errors: "{{ $value }}"
- name: smart
rules:
- alert: smart failure
labels:
alertgroup: "{{ $labels.instance }}"
- alert: smart ssd wearout approaching
- expr: smart_percentage_used >= 90
+ expr: smart_percentage_used >= 80
for: 60m
labels:
alertgroup: "{{ $labels.instance }}"