Try to ensure tile servers use an integer number of listen buckets

[chef.git] / cookbooks / prometheus / templates / default / alert_rules.yml.erb
diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb

index d717e4f0aeedacf3f5c8fc628485c9b945eedc05..956c0d5b3a729935bedc1bbf087672c79ddf1180 100644 (file)
--- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb
+++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb
@@ -52,6 +52,13 @@ groups:
            alertgroup: "{{ $labels.instance }}"
          annotations:
            busy_workers: "{{ $value | humanizePercentage }}"
+      - alert: apache connection limit
+        expr: (apache_connections{state="total"} - on (instance) apache_connections{state="closing"}) / on (instance) (apache_server_limit * on (instance) (apache_threads_per_child + on (instance) (apache_async_request_worker_factor * on (instance) apache_workers{state="idle"} / on(instance) apache_processes{state="all"}))) > 0.8
+        for: 5m
+        labels:
+          alertgroup: "{{ $labels.instance }}"
+        annotations:
+          connections: "{{ $value | humanizePercentage }}"
    - name: chef
      rules:
        - alert: chef client not running
@@ -693,14 +700,14 @@ groups:
    - name: taginfo
      rules:
        - alert: taginfo planet age
-        expr: time() - taginfo_data_from_seconds > 129600
+        expr: time() - taginfo_data_from_seconds > 129600 and on (instance) chef_role{name="taginfo"}
          for: 0m
          labels:
            alertgroup: taginfo
          annotations:
            age: "{{ $value | humanizeDuration }}"
        - alert: taginfo database age
-        expr: time() - taginfo_database_update_finish_seconds > 129600
+        expr: time() - taginfo_database_update_finish_seconds > 129600 and on (instance) chef_role{name="taginfo"}
          for: 0m
          labels:
            alertgroup: taginfo