From ab863f7f6f6e537980861c533b5d8612986824f9 Mon Sep 17 00:00:00 2001 From: Tom Hughes Date: Mon, 3 Jul 2023 17:16:55 +0100 Subject: [PATCH] Update postgresql exporter configuration --- cookbooks/postgresql/attributes/default.rb | 1 - cookbooks/postgresql/recipes/default.rb | 4 +- .../default/postgres_queries.yml.erb | 138 ------------------ .../templates/default/alert_rules.yml.erb | 4 +- 4 files changed, 5 insertions(+), 142 deletions(-) diff --git a/cookbooks/postgresql/attributes/default.rb b/cookbooks/postgresql/attributes/default.rb index 54a224345..038fadeff 100644 --- a/cookbooks/postgresql/attributes/default.rb +++ b/cookbooks/postgresql/attributes/default.rb @@ -1,6 +1,5 @@ default[:postgresql][:versions] = [] default[:postgresql][:clusters] = {} -default[:postgresql][:monitor_tables] = true default[:postgresql][:settings][:defaults][:port] = "5432" default[:postgresql][:settings][:defaults][:max_connections] = "100" default[:postgresql][:settings][:defaults][:ssl] = "true" diff --git a/cookbooks/postgresql/recipes/default.rb b/cookbooks/postgresql/recipes/default.rb index a3ef7d23d..f5bab6231 100644 --- a/cookbooks/postgresql/recipes/default.rb +++ b/cookbooks/postgresql/recipes/default.rb @@ -166,12 +166,14 @@ template "/etc/prometheus/exporters/postgres_queries.yml" do mode "644" end +# lag / lag_seconds +# process_idle missing state prometheus_exporter "postgres" do port 9187 scrape_interval "1m" scrape_timeout "1m" user "postgres" - options "--extend.query-path=/etc/prometheus/exporters/postgres_queries.yml" + options "--no-collector.process_idle --extend.query-path=/etc/prometheus/exporters/postgres_queries.yml" environment "DATA_SOURCE_URI" => uris.sort.uniq.first, "PG_EXPORTER_AUTO_DISCOVER_DATABASES" => "true", "PG_EXPORTER_EXCLUDE_DATABASES" => "postgres,template0,template1" diff --git a/cookbooks/postgresql/templates/default/postgres_queries.yml.erb b/cookbooks/postgresql/templates/default/postgres_queries.yml.erb index 1c33e0c1e..a92382edc 100644 --- a/cookbooks/postgresql/templates/default/postgres_queries.yml.erb +++ b/cookbooks/postgresql/templates/default/postgres_queries.yml.erb @@ -1,128 +1,3 @@ -pg_replication: - query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag_seconds" - master: true - metrics: - - lag_seconds: - usage: "GAUGE" - description: "Replication lag behind master in seconds" - -pg_postmaster: - query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()" - master: true - metrics: - - start_time_seconds: - usage: "GAUGE" - description: "Time at which postmaster started" -<% if node[:postgresql][:monitor_tables] -%> - -pg_stat_user_tables: - query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables" - metrics: - - datname: - usage: "LABEL" - description: "Name of current database" - - schemaname: - usage: "LABEL" - description: "Name of the schema that this table is in" - - relname: - usage: "LABEL" - description: "Name of this table" - - seq_scan: - usage: "COUNTER" - description: "Number of sequential scans initiated on this table" - - seq_tup_read: - usage: "COUNTER" - description: "Number of live rows fetched by sequential scans" - - idx_scan: - usage: "COUNTER" - description: "Number of index scans initiated on this table" - - idx_tup_fetch: - usage: "COUNTER" - description: "Number of live rows fetched by index scans" - - n_tup_ins: - usage: "COUNTER" - description: "Number of rows inserted" - - n_tup_upd: - usage: "COUNTER" - description: "Number of rows updated" - - n_tup_del: - usage: "COUNTER" - description: "Number of rows deleted" - - n_tup_hot_upd: - usage: "COUNTER" - description: "Number of rows HOT updated (i.e., with no separate index update required)" - - n_live_tup: - usage: "GAUGE" - description: "Estimated number of live rows" - - n_dead_tup: - usage: "GAUGE" - description: "Estimated number of dead rows" - - n_mod_since_analyze: - usage: "GAUGE" - description: "Estimated number of rows changed since last analyze" - - last_vacuum: - usage: "GAUGE" - description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" - - last_autovacuum: - usage: "GAUGE" - description: "Last time at which this table was vacuumed by the autovacuum daemon" - - last_analyze: - usage: "GAUGE" - description: "Last time at which this table was manually analyzed" - - last_autoanalyze: - usage: "GAUGE" - description: "Last time at which this table was analyzed by the autovacuum daemon" - - vacuum_count: - usage: "COUNTER" - description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" - - autovacuum_count: - usage: "COUNTER" - description: "Number of times this table has been vacuumed by the autovacuum daemon" - - analyze_count: - usage: "COUNTER" - description: "Number of times this table has been manually analyzed" - - autoanalyze_count: - usage: "COUNTER" - description: "Number of times this table has been analyzed by the autovacuum daemon" - -pg_statio_user_tables: - query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables" - metrics: - - datname: - usage: "LABEL" - description: "Name of current database" - - schemaname: - usage: "LABEL" - description: "Name of the schema that this table is in" - - relname: - usage: "LABEL" - description: "Name of this table" - - heap_blks_read: - usage: "COUNTER" - description: "Number of disk blocks read from this table" - - heap_blks_hit: - usage: "COUNTER" - description: "Number of buffer hits in this table" - - idx_blks_read: - usage: "COUNTER" - description: "Number of disk blocks read from all indexes on this table" - - idx_blks_hit: - usage: "COUNTER" - description: "Number of buffer hits in all indexes on this table" - - toast_blks_read: - usage: "COUNTER" - description: "Number of disk blocks read from this table's TOAST table (if any)" - - toast_blks_hit: - usage: "COUNTER" - description: "Number of buffer hits in this table's TOAST table (if any)" - - tidx_blks_read: - usage: "COUNTER" - description: "Number of disk blocks read from this table's TOAST table indexes (if any)" - - tidx_blks_hit: - usage: "COUNTER" - description: "Number of buffer hits in this table's TOAST table indexes (if any)" -<% end -%> - pg_process_idle: query: | WITH @@ -174,19 +49,6 @@ pg_process_idle: usage: "HISTOGRAM" description: "Idle time of server processes" -pg_unfrozen_ids: - query: "SELECT current_database() AS datname, max(age(relfrozenxid)) AS xid_age, max(mxid_age(relminmxid)) AS mxid_age FROM pg_class WHERE relkind IN ('r', 'm')" - metrics: - - datname: - usage: "LABEL" - description: "Name of the database" - - xid_age: - usage: "GAUGE" - description: "Age of the oldest unfrozen transaction ID in this database" - - mxid_age: - usage: "GAUGE" - description: "Age of the oldest unfrozen multixact ID in this database" - pg_wal: query: "SELECT count(*) AS segment_count FROM pg_ls_waldir() WHERE name ~ '^[0-9A-Z]{24}$'" master: true diff --git a/cookbooks/prometheus/templates/default/alert_rules.yml.erb b/cookbooks/prometheus/templates/default/alert_rules.yml.erb index 396de8de4..3c448cc2a 100644 --- a/cookbooks/prometheus/templates/default/alert_rules.yml.erb +++ b/cookbooks/prometheus/templates/default/alert_rules.yml.erb @@ -99,7 +99,7 @@ groups: - name: database rules: - alert: postgres replication delay - expr: pg_replication_lag_seconds > 30 + expr: pg_replication_lag > 30 for: 15m labels: alertgroup: database @@ -507,7 +507,7 @@ groups: labels: alertgroup: "{{ $labels.instance }}" - alert: postgresql replication delay - expr: pg_replication_lag_seconds > 30 + expr: pg_replication_lag > 30 for: 15m labels: alertgroup: "{{ $labels.instance }}" -- 2.39.5