X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/cba7532d66e48bf49632eaf22d4ba9407b8182af..923a789051a29fbbce773c56505be4b78fcc4287:/cookbooks/prometheus/templates/default/alert_rules.yml.erb

Summary of this patch to the Prometheus alert rules template:
  * fastly group: rename the alert "error rate" to "fastly error rate" and
    add a "fastly healthcheck failing" alert.
  * add a new "raid" group with "raid array degraded" and "raid disk failed"
    alerts, inserted immediately before the "smart" group.

NOTE(review): line breaks were restored to match the hunk headers' line
counts; the leading indentation of hunk lines was reconstructed from the
YAML nesting and should be confirmed against the upstream blob.

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index a52901233..61a7370b1 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -114,13 +114,18 @@ groups:
           delay: "{{ $value | humanizeDuration }}"
   - name: fastly
     rules:
-      - alert: error rate
+      - alert: fastly error rate
         expr: sum(rate(fastly_rt_status_group_total{status_group="5xx"}[5m])) by (service_name, datacenter) / sum(rate(fastly_rt_status_group_total[5m])) by (service_name, datacenter) > 0.005
         for: 15m
         labels:
           alertgroup: fastly
         annotations:
           error_rate: "{{ $value | humanizePercentage }}"
+      - alert: fastly healthcheck failing
+        expr: fastly_healthcheck_status == 0
+        for: 5m
+        labels:
+          alertgroup: fastly
   - name: filesystem
     rules:
       - alert: readonly filesystem
@@ -417,6 +422,18 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           queries: "{{ $value }}"
+  - name: raid
+    rules:
+      - alert: raid array degraded
+        expr: ohai_array_info{status="degraded"} > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+      - alert: raid disk failed
+        expr: ohai_disk_info{status="failed"} > 0
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
   - name: smart
     rules:
       - alert: smart failure