+ - name: prometheus
+ rules:
+ - alert: prometheus configuration error
+ expr: prometheus_config_last_reload_successful == 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
+ - alert: prometheus target missing
+ expr: up == 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
+ - alert: node exporter text file scrape error
+ expr: node_textfile_scrape_error > 0
+ for: 10m
+ labels:
+ alertgroup: "prometheus"
+ - name: raid
+ rules:
+ - alert: raid controller battery failed
+ expr: ohai_controller_info{battery_status="failed"} > 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - alert: raid controller battery recharging
+ expr: ohai_controller_info{battery_status="recharging"} > 0
+ for: 4h
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - alert: raid array degraded
+ expr: ohai_array_info{status="degraded"} > 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - alert: raid disk failed
+ expr: ohai_disk_info{status="failed"} > 0
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ - name: rasdaemon
+ rules:
+ - alert: memory controller errors
+ expr: increase(rasdaemon_mc_events_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ new_errors: "{{ $value }}"
+ - alert: pcie aer errors
+ expr: increase(rasdaemon_aer_events_total[1m]) > 0
+ for: 0m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
+ new_errors: "{{ $value }}"
+ - name: resolved
+ rules:
+ - alert: dnssec validation failures
+ expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"