pressure: "{{ $value | humanizePercentage }}"
- name: database
rules:
- - alert: postgres replication delay
- expr: pg_replication_lag_seconds > 30
- for: 15m
+ - alert: active rails queries  # more than 50 active rails queries per instance on the openstreetmap database
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="rails",state="active"}) by (instance) > 50 and on (instance) chef_role{name="db-master"}
+ for: 5m  # condition must hold for 5 minutes before the alert fires
labels:
alertgroup: database
annotations:
- delay: "{{ $value | humanizeDuration }}"
+ queries: "{{ $value }}"  # the per-instance sum that exceeded the threshold
+ - alert: active cgimap queries  # more than 30 active cgimap queries per instance on the openstreetmap database
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="cgimap",state="active"}) by (instance) > 30 and on (instance) chef_role{name="db-master"}
+ for: 5m  # condition must hold for 5 minutes before the alert fires
+ labels:
+ alertgroup: database
+ annotations:
+ queries: "{{ $value }}"  # value is a query count, so it uses the same key as the rails query alert
- name: discourse
rules:
- alert: discourse job failure rate
alertgroup: "{{ $labels.instance }}"
annotations:
connections_used: "{{ $value | humanizePercentage }}"
+ - alert: mysql connection errors  # connection error counter grew within the last minute
+ expr: increase(mysql_global_status_connection_errors_total[1m]) > 0
+ for: 0m  # no hold period: fires on the first evaluation where the expression matches
+ labels:
+ alertgroup: "{{ $labels.instance }}"  # grouped under the instance label of the firing series
+ annotations:
+ error_count: "{{ $value }}"  # increase of the counter over the 1m window
- name: network
rules:
+ - alert: interface redundancy lost  # node_bonding_active dropped below 2 for an 802.3ad bond
+ expr: node_bonding_active < 2 and on (instance, master) label_replace(chef_network_interface{bond_mode="802.3ad"}, "master", "$1", "name", "(.*)")  # label_replace copies the chef "name" label into "master" so both sides can be matched on (instance, master)
+ for: 5m  # condition must hold for 5 minutes before the alert fires
+ labels:
+ alertgroup: "{{ $labels.instance }}"  # grouped under the instance label of the firing series
+ annotations:
+ link_count: "{{ $value }}"  # the node_bonding_active value that fell below 2
- alert: interface transmit rate
expr: rate(node_network_transmit_bytes_total[1m]) / node_network_speed_bytes > 0.99
for: 5m
alertgroup: "{{ $labels.instance }}"
annotations:
new_deadlocks: "{{ $value }}"
- - alert: postgresql slow queries
- expr: pg_slow_queries > 0
- for: 5m
- labels:
- alertgroup: "{{ $labels.instance }}"
- annotations:
- queries: "{{ $value }}"
- alert: postgresql idle transactions
expr: sum(pg_process_idle_seconds_count{state="idle in transaction"}) by (instance, server) > sum(pg_process_idle_seconds_bucket{state="idle in transaction",le="300"}) by (instance, server)
for: 5m
for: 10m
labels:
alertgroup: "prometheus"
+ - alert: node exporter text file scrape error  # node_textfile_scrape_error is above zero
+ expr: node_textfile_scrape_error > 0
+ for: 10m  # condition must hold for 10 minutes before the alert fires
+ labels:
+ alertgroup: "prometheus"  # static group name, not templated from a label
- name: raid
rules:
- alert: raid controller battery failed