groups:
- name: amsterdam
rules:
# Diff: the alert previously named "uplink" is renamed to "he uplink"; the
# rest of the rule is unchanged context.
# Fires when junos_interface_up for a site="amsterdam" interface whose name
# matches ge-[01]/2/2 has been anything other than 1 for 6 minutes.
# The status annotation carries the raw sample value.
- - alert: uplink
+ - alert: he uplink
expr: junos_interface_up{site="amsterdam",name=~"ge-[01]/2/2"} != 1
for: 6m
labels:
alertgroup: "amsterdam"
annotations:
status: "{{ $value }}"
# Diff: new rule. Same shape as the ge-[01]/2/2 uplink rule for this site,
# but watching interfaces whose name matches xe-[01]/2/0.
# Fires when junos_interface_up has been anything other than 1 for 6 minutes.
+ - alert: equinix uplink
+ expr: junos_interface_up{site="amsterdam",name=~"xe-[01]/2/0"} != 1
+ for: 6m
+ labels:
+ alertgroup: "amsterdam"
+ annotations:
+ status: "{{ $value }}"
# Fires when rPDU2PhaseStatusCurrent (site="amsterdam", phase index "1"),
# divided by 10, has stayed above 28 for 6 minutes.
# The value being compared and reported is a current, not apparent power:
# the APC PowerNet-MIB defines rPDU2PhaseStatusCurrent in tenths of amps,
# so the annotation is suffixed with "A" rather than "kVA".
# NOTE(review): unlike the neighbouring amsterdam rules this one sets no
# alertgroup label - confirm whether that is intentional.
- alert: pdu current draw
expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28
for: 6m
annotations:
current: "{{ $value | humanize }}A"
# Diff: the accepted band is widened from 18..26 to 15..32 (metric / 10).
# Both comparisons use min() over the site="amsterdam" sensors, so the upper
# bound only trips once the lowest reading exceeds 32.
# NOTE(review): presumably the metric is in tenths of a degree C - confirm.
# Must hold for 6 minutes before firing.
- alert: site temperature
- expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 18 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 26
+ expr: min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 < 15 or min(rPDU2SensorTempHumidityStatusTempC{site="amsterdam"}) / 10 > 32
for: 6m
labels:
alertgroup: "amsterdam"
annotations:
temperature: "{{ $value | humanize }}C"
- alert: site humidity
- expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 < 0.25 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 > 0.65
+ expr: max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 < 0.08 or max(rPDU2SensorTempHumidityStatusRelativeHumidity{site="amsterdam"}) / 100 > 0.8
for: 6m
labels:
alertgroup: "amsterdam"
alertgroup: "{{ $labels.instance }}"
annotations:
busy_workers: "{{ $value | humanizePercentage }}"
# Diff: new rule. Per instance, computes
#   (total connections - closing connections)
#   / (server_limit * (threads_per_child
#        + async_request_worker_factor * idle workers / all processes))
# and fires when that ratio has been above 0.8 for 5 minutes.
# Every binary operation is matched with on (instance).
# The connections annotation renders the ratio as a percentage.
+ - alert: apache connection limit
+ expr: (apache_connections{state="total"} - on (instance) apache_connections{state="closing"}) / on (instance) (apache_server_limit * on (instance) (apache_threads_per_child + on (instance) (apache_async_request_worker_factor * on (instance) apache_workers{state="idle"} / on(instance) apache_processes{state="all"}))) > 0.8
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ connections: "{{ $value | humanizePercentage }}"
- name: chef
rules:
- alert: chef client not running
pressure: "{{ $value | humanizePercentage }}"
- name: database
rules:
# Diff: "postgres replication delay" (pg_replication_lag_seconds > 30 for 15m)
# is replaced by "active rails queries"; the labels/annotations keys are
# shared context and only the annotation entry changes (delay -> queries).
# New rule: per-instance sum of pg_stat_activity_count for
# datname="openstreetmap", usename="rails", state="active" above 50,
# kept only for instances that also have chef_role{name="db-master"},
# sustained for 5 minutes.
- - alert: postgres replication delay
- expr: pg_replication_lag_seconds > 30
- for: 15m
+ - alert: active rails queries
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="rails",state="active"}) by (instance) > 50 and on (instance) chef_role{name="db-master"}
+ for: 5m
labels:
alertgroup: database
annotations:
- delay: "{{ $value | humanizeDuration }}"
+ queries: "{{ $value }}"
# Diff: new rule. Same shape as the rails query-count rule, but for
# usename="cgimap" and with a threshold of 30 active queries per instance.
# Restricted to instances with chef_role{name="db-master"}; fires after 5 minutes.
+ - alert: active cgimap queries
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="cgimap",state="active"}) by (instance) > 30 and on (instance) chef_role{name="db-master"}
+ for: 5m
+ labels:
+ alertgroup: database
+ annotations:
+ queries: "{{ $value }}"
- name: discourse
rules:
- alert: discourse job failure rate
failure_rate: "{{ $value }} jobs/s"
- name: dublin
rules:
# Diff: the alert previously named "uplink" is renamed to "he uplink"; the
# rest of the rule is unchanged context.
# Fires when junos_interface_up for a site="dublin" interface whose name
# matches ge-[01]/2/2 has been anything other than 1 for 6 minutes.
- - alert: uplink
+ - alert: he uplink
expr: junos_interface_up{site="dublin",name=~"ge-[01]/2/2"} != 1
for: 6m
labels:
alertgroup: "dublin"
annotations:
status: "{{ $value }}"
# Diff: new rule. Same shape as the ge-[01]/2/2 uplink rule for this site,
# but watching site="dublin" interfaces whose name matches xe-[01]/2/0.
# Fires when junos_interface_up has been anything other than 1 for 6 minutes.
+ - alert: equinix uplink
+ expr: junos_interface_up{site="dublin",name=~"xe-[01]/2/0"} != 1
+ for: 6m
+ labels:
+ alertgroup: "dublin"
+ annotations:
+ status: "{{ $value }}"
- alert: pdu current draw
expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28
for: 6m
for: 5m
labels:
alertgroup: "{{ $labels.site }}"
# Diff: new rule. Fires when junos_interface_diagnostics_laser_rx_dbm is
# below -12 for 5 minutes, but only for series whose (site, instance, name)
# also has junos_interface_admin_up == 1.
# The alertgroup label is taken from the series' site label.
+ - alert: juniper laser receive power
+ expr: junos_interface_diagnostics_laser_rx_dbm < -12 and on (site, instance, name) junos_interface_admin_up == 1
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.site }}"
+ annotations:
+ power: "{{ $value }} dBm"
# Diff: new rule. Fires when junos_interface_diagnostics_laser_output_dbm is
# below -8 for 5 minutes, but only for series whose (site, instance, name)
# also has junos_interface_admin_up == 1.
# The alertgroup label is taken from the series' site label.
+ - alert: juniper laser transmit power
+ expr: junos_interface_diagnostics_laser_output_dbm < -8 and on (site, instance, name) junos_interface_admin_up == 1
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.site }}"
+ annotations:
+ power: "{{ $value }} dBm"
# Diff: new rule group "load" with a single rule.
# Per instance, divides the summed node_load5 by the number of
# node_cpu_frequency_max_hertz series and fires when the ratio has been
# above 2 for 5 minutes.
# NOTE(review): presumably there is one node_cpu_frequency_max_hertz series
# per CPU, making this load per CPU - confirm against the exporter.
# humanizePercentage renders the ratio as a percentage, so the threshold of 2
# shows up as 200% in the annotation.
+ - name: load
+ rules:
+ - alert: load average
+ expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ load: "{{ $value | humanizePercentage }}"
- name: mail
rules:
- alert: exim down
alertgroup: "{{ $labels.instance }}"
annotations:
connections_used: "{{ $value | humanizePercentage }}"
# Diff: new rule. Fires as soon as (for: 0m) the
# mysql_global_status_connection_errors_total counter shows any increase
# over the last minute.
# The error_count annotation carries the computed increase.
+ - alert: mysql connection errors
+ expr: increase(mysql_global_status_connection_errors_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ error_count: "{{ $value }}"
- name: network
rules:
# Diff: new rule. Fires when node_bonding_active has been below 2 for
# 5 minutes on a bond that chef_network_interface reports with
# bond_mode="802.3ad".
# label_replace copies the chef series' "name" label into a "master" label so
# the two metrics can be matched on (instance, master).
+ - alert: interface redundancy lost
+ expr: node_bonding_active < 2 and on (instance, master) label_replace(chef_network_interface{bond_mode="802.3ad"}, "master", "$1", "name", "(.*)")
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ link_count: "{{ $value }}"
- alert: interface transmit rate
expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
for: 5m
alertgroup: nominatim
annotations:
delay: "{{ $value | humanizeDuration }}"
# Diff: new rule. Sums nginx_connections_writing over all instances that have
# chef_role{name="nominatim"} (no "by" clause, so a single total) and fires
# when that total has been above 2500 for 15 minutes.
# The rule defines no annotations.
+ - alert: nominatim connections
+ expr: sum(nginx_connections_writing and on (instance) chef_role{name="nominatim"}) > 2500
+ for: 15m
+ labels:
+ alertgroup: nominatim
- name: overpass
rules:
- alert: overpass osm database age
alertgroup: "{{ $labels.instance }}"
annotations:
new_deadlocks: "{{ $value }}"
# Diff: this rule is removed in its entirety. It fired when pg_slow_queries
# had been above 0 for 5 minutes.
- - alert: postgresql slow queries
- expr: pg_slow_queries > 0
- for: 5m
- labels:
- alertgroup: "{{ $labels.instance }}"
- annotations:
- queries: "{{ $value }}"
- alert: postgresql idle transactions
expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
for: 5m
for: 10m
labels:
alertgroup: "prometheus"
# Diff: new rule. Fires when node_textfile_scrape_error has been above 0 for
# 10 minutes. Grouped under the fixed "prometheus" alertgroup; no annotations.
+ - alert: node exporter text file scrape error
+ expr: node_textfile_scrape_error > 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
- name: raid
rules:
- alert: raid controller battery failed
alertgroup: "{{ $labels.instance }}"
annotations:
new_errors: "{{ $value }}"
# Diff: new rule group "resolved" with a single rule.
# Fires when the per-second rate of resolved_dnssec_verdicts_total with
# result="bogus", measured over 1 minute, has been above 1 for 5 minutes.
# The rule defines no annotations.
+ - name: resolved
+ rules:
+ - alert: dnssec validation failures
+ expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
- name: smart
rules:
- alert: smart failure
- name: taginfo
rules:
# Fires immediately (for: 0m) when time() minus taginfo_data_from_seconds
# exceeds 129600 seconds (36 hours).
# Diff: the expression is additionally restricted to instances that have
# chef_role{name="taginfo"}.
- alert: taginfo planet age
- expr: time() - taginfo_data_from_seconds > 129600
+ expr: time() - taginfo_data_from_seconds > 129600 and on (instance) chef_role{name="taginfo"}
for: 0m
labels:
alertgroup: taginfo
annotations:
age: "{{ $value | humanizeDuration }}"
- alert: taginfo database age
- expr: time() - taginfo_database_update_finish_seconds > 129600
+ expr: time() - taginfo_database_update_finish_seconds > 129600 and on (instance) chef_role{name="taginfo"}
for: 0m
labels:
alertgroup: taginfo
- name: web
rules:
- alert: web error rate
- expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002
+ expr: sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) / sum(rate(api_call_count_total[5m])) by (instance) > 0.002 and sum(rate(api_call_count_total{status=~"50[0-8]|5[1-9][0-9]"}[5m])) by (instance) > 0.01
for: 5m
labels:
alertgroup: web