groups:
- name: amsterdam
rules:
# Amsterdam uplink alert on the ge-* ports; this change renames it from
# "uplink" to "he uplink".
# Fires when junos_interface_up for an interface whose name fully matches
# ge-[01]/2/2 (i.e. ge-0/2/2 or ge-1/2/2) is anything other than 1.
- - alert: uplink
+ - alert: he uplink
expr: junos_interface_up{site="amsterdam",name=~"ge-[01]/2/2"} != 1
# The condition must hold continuously for 6 minutes before the alert fires.
for: 6m
labels:
alertgroup: "amsterdam"
annotations:
# Rendered with the value the expression returned for the alerting series.
status: "{{ $value }}"
# New Amsterdam uplink alert covering the xe-* ports.
# Fires when junos_interface_up for an interface whose name fully matches
# xe-[01]/2/0 (i.e. xe-0/2/0 or xe-1/2/0) is anything other than 1.
+ - alert: equinix uplink
+ expr: junos_interface_up{site="amsterdam",name=~"xe-[01]/2/0"} != 1
# The condition must hold continuously for 6 minutes before the alert fires.
+ for: 6m
+ labels:
+ alertgroup: "amsterdam"
+ annotations:
# Rendered with the value the expression returned for the alerting series.
+ status: "{{ $value }}"
- alert: pdu current draw
expr: rPDU2PhaseStatusCurrent{site="amsterdam",rPDU2PhaseStatusIndex="1"} / 10 > 28
for: 6m
failure_rate: "{{ $value }} jobs/s"
- name: dublin
rules:
# Dublin uplink alert on the ge-* ports; this change renames it from
# "uplink" to "he uplink".
# Fires when junos_interface_up for an interface whose name fully matches
# ge-[01]/2/2 (i.e. ge-0/2/2 or ge-1/2/2) is anything other than 1.
- - alert: uplink
+ - alert: he uplink
expr: junos_interface_up{site="dublin",name=~"ge-[01]/2/2"} != 1
# The condition must hold continuously for 6 minutes before the alert fires.
for: 6m
labels:
alertgroup: "dublin"
annotations:
# Rendered with the value the expression returned for the alerting series.
status: "{{ $value }}"
# New Dublin uplink alert covering the xe-* ports.
# Fires when junos_interface_up for an interface whose name fully matches
# xe-[01]/2/0 (i.e. xe-0/2/0 or xe-1/2/0) is anything other than 1.
+ - alert: equinix uplink
+ expr: junos_interface_up{site="dublin",name=~"xe-[01]/2/0"} != 1
# The condition must hold continuously for 6 minutes before the alert fires.
+ for: 6m
+ labels:
+ alertgroup: "dublin"
+ annotations:
# Rendered with the value the expression returned for the alerting series.
+ status: "{{ $value }}"
- alert: pdu current draw
expr: rPDU2PhaseStatusCurrent{site="dublin",rPDU2PhaseStatusIndex="1"} / 10 > 28
for: 6m
alertgroup: "{{ $labels.site }}"
annotations:
power: "{{ $value }} dBm"
# New rule group: per-instance load alerting.
+ - name: load
+ rules:
# Divides each instance's summed node_load5 by the number of
# node_cpu_frequency_max_hertz series for that instance and alerts when the
# ratio stays above 2 for 5 minutes.
# NOTE(review): presumably there is one node_cpu_frequency_max_hertz series
# per CPU, making the ratio "load per CPU" - confirm against the exporter.
+ - alert: load average
+ expr: sum(node_load5) by (instance) / count(node_cpu_frequency_max_hertz) by (instance) > 2
+ for: 5m
+ labels:
# Grouped per instance, unlike the site-level uplink rules, which use a
# fixed site name.
+ alertgroup: "{{ $labels.instance }}"
+ annotations:
# humanizePercentage renders the ratio as a percentage (e.g. 2 -> 200%).
+ load: "{{ $value | humanizePercentage }}"
- name: mail
rules:
- alert: exim down
alertgroup: "{{ $labels.instance }}"
annotations:
new_errors: "{{ $value }}"
# New rule group: DNSSEC validation failures counted by
# resolved_dnssec_verdicts_total.
+ - name: resolved
+ rules:
# Alerts when the per-second rate of verdicts with result="bogus", computed
# over a 1-minute window, stays above 1 for 5 minutes.
# NOTE(review): rate() needs at least two samples inside the [1m] window;
# confirm the scrape interval for this metric is short enough.
# This rule defines no annotations as added here.
+ - alert: dnssec validation failures
+ expr: rate(resolved_dnssec_verdicts_total{result="bogus"}[1m]) > 1
+ for: 5m
+ labels:
+ alertgroup: "{{ $labels.instance }}"
- name: smart
rules:
- alert: smart failure