# DO NOT EDIT - This file is being maintained by Chef
groups:
- - name: alertmanager
- rules:
- - alert: prometheus target missing
- expr: up == 0
- for: 10m
- labels:
- alertgroup: "prometheus"
- name: amsterdam
rules:
- alert: pdu current draw
annotations:
error_rate: "{{ $value | humanizePercentage }}"
- alert: fastly healthcheck failing
- expr: fastly_healthcheck_status = 0
+ expr: count(fastly_healthcheck_status == 0) > 0
+ for: 15m
+ labels:
+ alertgroup: fastly
+ - alert: fastly healthcheck failing
+ expr: count(fastly_healthcheck_status == 0) > 4
for: 5m
labels:
alertgroup: fastly
alertgroup: "{{ $labels.instance }}"
annotations:
queries: "{{ $value }}"
+ - name: prometheus
+ rules:
+ - alert: prometheus configuration error
+ expr: prometheus_config_last_reload_successful == 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
+ - alert: prometheus target missing
+ expr: up == 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
+ - name: raid
+ rules:
+ - alert: raid array degraded
+ expr: ohai_array_info{status="degraded"} > 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - alert: raid disk failed
+ expr: ohai_disk_info{status="failed"} > 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
- name: smart
rules:
- alert: smart failure
labels:
alertgroup: "{{ $labels.instance }}"
- alert: smart ssd wearout approaching
- expr: smart_percentage_used >= 90
+ expr: smart_percentage_used >= 80
for: 60m
labels:
alertgroup: "{{ $labels.instance }}"