for: 5m
labels:
alertgroup: "prometheus"
# Group "apache": Apache availability and worker-saturation alerts. Each rule
# must hold for 5m and sets the alertgroup label from the instance label.
+ - name: apache
+ rules:
# Fires when apache_up is 0.
+ - alert: apache down
+ expr: apache_up == 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
# Fires when, per instance, busy workers divided by the scoreboard total
# exceeds 0.8.
+ - alert: apache workers busy
+ expr: sum(apache_workers{state="busy"}) by (instance) / sum(apache_scoreboard) by (instance) > 0.8
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
# $value is the busy/total ratio produced by expr, formatted as a percentage.
+ busy_workers: "{{ $value | humanizePercentage }}"
# Group "database": a single replication-lag alert with a fixed alertgroup
# label of "database".
+ - name: database
+ rules:
# Fires when pg_replication_lag_seconds stays above 5 for 5m.
+ - alert: postgres replication delay
+ expr: pg_replication_lag_seconds > 5
+ for: 5m
+ labels:
+ alertgroup: database
+ annotations:
# $value is the lag reported by expr; humanizeDuration treats it as seconds.
+ delay: "{{ $value | humanizeDuration }}"
- name: hwmon
rules:
- alert: hwmon fan alarm
alertgroup: "{{ $labels.instance }}"
annotations:
new_oom_kills: "{{ $value }}"
# Group "network": interface throughput, interface error-rate and conntrack
# table alerts. Each rule must hold for 5m, sets the alertgroup label from the
# instance label, and reports the ratio from expr as a percentage annotation.
+ - name: network
+ rules:
# Fires when the 1m transmit byte rate divided by node_network_speed_bytes
# exceeds 0.98.
+ - alert: interface transmit rate
+ expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.98
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ bandwidth_used: "{{ $value | humanizePercentage }}"
# Receive-side counterpart of the rule above, with the same 0.98 threshold.
+ - alert: interface receive rate
+ expr: rate(node_network_receive_bytes_total[1m]) / node_network_speed_bytes > 0.98
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ bandwidth_used: "{{ $value | humanizePercentage }}"
# Fires when the 1m rate of transmit errors divided by the 1m rate of
# transmitted packets exceeds 0.01.
+ - alert: interface transmit errors
+ expr: rate(node_network_transmit_errs_total[1m]) / rate(node_network_transmit_packets_total[1m]) > 0.01
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ error_rate: "{{ $value | humanizePercentage }}"
# Receive-side counterpart of the rule above, with the same 0.01 threshold.
+ - alert: interface receive errors
+ expr: rate(node_network_receive_errs_total[1m]) / rate(node_network_receive_packets_total[1m]) > 0.01
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ error_rate: "{{ $value | humanizePercentage }}"
# Fires when conntrack entries divided by the conntrack entries limit
# exceeds 0.8.
+ - alert: conntrack entries
+ expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ entries_used: "{{ $value | humanizePercentage }}"
# Group "tile": tile-rendering alerts. Each rule must hold for 5m and carries
# a fixed alertgroup label of "tile".
+ - name: tile
+ rules:
# Fires when renderd_replication_delay stays above 120.
# NOTE(review): the unit is presumably seconds, since the annotation formats
# the value with humanizeDuration - confirm against the exporter.
+ - alert: renderd replication delay
+ expr: renderd_replication_delay > 120
+ for: 5m
+ labels:
+ alertgroup: tile
+ annotations:
+ delay: "{{ $value | humanizeDuration }}"
# Fires when, per instance, the 5m rate of responses with code="404" divided
# by the 5m rate of all responses exceeds 0.05.
+ - alert: missed tile rate
+ expr: sum(rate(modtile_http_response_total{code="404"}[5m])) by (instance) / sum(rate(modtile_http_response_total[5m])) by (instance) > 0.05
+ for: 5m
+ labels:
+ alertgroup: tile
+ annotations:
# $value is the 404 share computed by expr, formatted as a percentage.
+ miss_rate: "{{ $value | humanizePercentage }}"
# Group "time": clock health alerts. Each rule must hold for 5m and sets the
# alertgroup label from the instance label.
+ - name: time
+ rules:
# Fires when the minimum sync status over the last minute is 0 and the
# reported maximum error is at least 16.
+ - alert: clock not synchronising
+ expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
# Fires when the offset is above 0.05 and not decreasing, or below -0.05 and
# not increasing (judged by its derivative over 5m).
+ - alert: clock skew detected
+ expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
# Looks up the current offset for this instance. `query` returns a list of
# samples, so take the first sample's value before humanizeDuration, which
# needs a number; piping the list in directly makes template expansion fail.
+ skew: "{{ with printf \"node_timex_offset_seconds{instance='%s'}\" $labels.instance | query }} {{ . | first | value | humanizeDuration }}{{ end }}"