From: Tom Hughes
Date: Wed, 17 Nov 2021 17:11:01 +0000 (+0000)
Subject: Add alerts for planet dumps and replication feeds
X-Git-Url: https://git.openstreetmap.org./chef.git/commitdiff_plain/699e157e3d708f38064eea9072aa210fe5ccac30

Add alerts for planet dumps and replication feeds
---

Reviewer note (not part of the commit; text in this position precedes the
first "diff --git" header and is skipped when the patch is applied):

  * Every rule added below has the same shape: it computes
    time() - file_stat_modif_time_seconds{path=~"..."} for files matching a
    /store/planet/... path regex and fires once that age has stayed above
    the rule's threshold (in seconds) for the rule's "for:" duration.
  * The trailing
    and ignoring (job, name, path) chef_role{name="planetdump"} == 1
    clause keeps only those series that match a chef_role{name="planetdump"}
    sample with value 1 after the job, name and path labels are dropped
    from the comparison.
  * $value in the annotations is the left-hand side of the comparison,
    i.e. the file age in seconds, rendered with humanizeDuration.

diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
index 0469226db..346a61377 100644
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -199,6 +199,50 @@ groups:
           alertgroup: "{{ $labels.instance }}"
         annotations:
           entries_used: "{{ $value | humanizePercentage }}"
+  - name: planet
+    rules:
+      - alert: planet dump overdue
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/(pbf|planet)/.*"} > 7 * 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 24h
+        labels:
+          alertgroup: planet
+        annotations:
+          overdue_by: "{{ $value | humanizeDuration }}"
+      - alert: notes dump overdue
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/notes/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 6h
+        labels:
+          alertgroup: planet
+        annotations:
+          overdue_by: "{{ $value | humanizeDuration }}"
+      - alert: daily replication feed delayed
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/day/.*"} > 86400 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 3h
+        labels:
+          alertgroup: planet
+        annotations:
+          delayed_by: "{{ $value | humanizeDuration }}"
+      - alert: hourly replication feed delayed
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/hour/.*"} > 3600 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 30m
+        labels:
+          alertgroup: planet
+        annotations:
+          delayed_by: "{{ $value | humanizeDuration }}"
+      - alert: minutely replication feed delayed
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/minute/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 5m
+        labels:
+          alertgroup: planet
+        annotations:
+          delayed_by: "{{ $value | humanizeDuration }}"
+      - alert: changeset replication feed delayed
+        expr: time() - file_stat_modif_time_seconds{path=~"/store/planet/replication/changesets/.*"} > 60 and ignoring (job, name, path) chef_role{name="planetdump"} == 1
+        for: 5m
+        labels:
+          alertgroup: planet
+        annotations:
+          delayed_by: "{{ $value | humanizeDuration }}"
   - name: postgresql
     rules:
       - alert: postgresql down