pressure: "{{ $value | humanizePercentage }}"
- name: database
rules:
# NOTE(review): every line of this rule carries an extra leading `- `, which
# looks like a diff removal marker rather than YAML syntax — confirm before
# feeding this text to a YAML parser.
# Rule content: alert when pg_replication_lag_seconds is above 30 and the
# condition has held for 15m. The measured value is exposed through the
# `delay` annotation, rendered with the humanizeDuration template function.
- - alert: postgres replication delay
- expr: pg_replication_lag_seconds > 30
- for: 15m
- labels:
- alertgroup: database
- annotations:
- delay: "{{ $value | humanizeDuration }}"
# Alert when the summed pg_stat_activity_count series for user `rails` in
# state `active` on the `openstreetmap` database stays above 50 for 5m.
# NOTE(review): the `- expr` / `+ expr` pair below looks like a diff showing
# the old and new expression — confirm which one is live.
#   old: pinned to instance="snap-01" and summed with no grouping, so the
#        result carries no `instance` label for `and on (instance)` to join
#        on — presumably why it was replaced.
#   new: no instance filter; summed `by (instance)`, so each instance is
#        compared against the threshold separately.
# In both forms `and on (instance) chef_role{name="db-master"}` keeps only
# results whose instance also has a chef_role series with name="db-master".
- alert: active rails queries
- expr: sum(pg_stat_activity_count{instance="snap-01",datname="openstreetmap",usename="rails",state="active"}) > 50 and on (instance) chef_role{name="db-master"}
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="rails",state="active"}) by (instance) > 50 and on (instance) chef_role{name="db-master"}
for: 5m
labels:
alertgroup: database
annotations:
# The annotation carries the raw value of the expression that fired.
queries: "{{ $value }}"
# Alert when the summed pg_stat_activity_count series for user `cgimap` in
# state `active` on the `openstreetmap` database stays above 30 for 5m.
# NOTE(review): the `- expr` / `+ expr` pair below looks like a diff showing
# the old and new expression — confirm which one is live.
#   old: pinned to instance="snap-01" and summed with no grouping, so the
#        result carries no `instance` label for `and on (instance)` to join
#        on — presumably why it was replaced.
#   new: no instance filter; summed `by (instance)`, so each instance is
#        compared against the threshold separately.
# In both forms `and on (instance) chef_role{name="db-master"}` keeps only
# results whose instance also has a chef_role series with name="db-master".
- alert: active cgimap queries
- expr: sum(pg_stat_activity_count{instance="snap-01",datname="openstreetmap",usename="cgimap",state="active"}) > 30 and on (instance) chef_role{name="db-master"}
+ expr: sum(pg_stat_activity_count{datname="openstreetmap",usename="cgimap",state="active"}) by (instance) > 30 and on (instance) chef_role{name="db-master"}
for: 5m
labels:
alertgroup: database
# NOTE(review): no annotations for this rule are visible here. The
# `alertgroup` line that follows uses a different value and may belong to
# another rule (elided diff context) — confirm it is not a duplicate key in
# this mapping.
alertgroup: "{{ $labels.instance }}"
annotations:
connections_used: "{{ $value | humanizePercentage }}"
# NOTE(review): every line of this rule carries a leading `+ `, which looks
# like a diff addition marker rather than YAML syntax — confirm before
# feeding this text to a YAML parser.
# Rule content: alert when mysql_global_status_connection_errors_total shows
# any increase over a 1m range. `for: 0m` means there is no pending period.
# The `alertgroup` label is templated from the series' `instance` label, and
# the `error_count` annotation carries the raw value of the expression.
+ - alert: mysql connection errors
+ expr: increase(mysql_global_status_connection_errors_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ error_count: "{{ $value }}"
- name: network
rules:
- alert: interface redundancy lost