X-Git-Url: https://git.openstreetmap.org./chef.git/blobdiff_plain/480d14d1f1967e4911302e485ebde397c12d8795..82f971ceac11eb18688337bc8341840d6ccae54a:/cookbooks/postgresql/templates/default/postgres_queries.yml.erb diff --git a/cookbooks/postgresql/templates/default/postgres_queries.yml.erb b/cookbooks/postgresql/templates/default/postgres_queries.yml.erb index 50c9ae49f..1c33e0c1e 100644 --- a/cookbooks/postgresql/templates/default/postgres_queries.yml.erb +++ b/cookbooks/postgresql/templates/default/postgres_queries.yml.erb @@ -1,22 +1,196 @@ pg_replication: - query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag_seconds" + query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag_seconds" master: true metrics: - lag_seconds: usage: "GAUGE" description: "Replication lag behind master in seconds" -pg_database: - query: "SELECT pg_database.oid AS datid, pg_database.datname, pg_database_size(pg_database.datname) AS size_bytes FROM pg_database" +pg_postmaster: + query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()" master: true - cache_seconds: 30 metrics: - - datid: + - start_time_seconds: + usage: "GAUGE" + description: "Time at which postmaster started" +<% if node[:postgresql][:monitor_tables] -%> + +pg_stat_user_tables: + query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables" + metrics: + - datname: usage: "LABEL" - description: "ID of the database" + description: "Name of current database" + - schemaname: + usage: "LABEL" + description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - seq_scan: + usage: "COUNTER" + description: "Number of sequential scans initiated on this table" + - seq_tup_read: + usage: "COUNTER" + description: "Number of live rows fetched by sequential scans" + - idx_scan: + usage: "COUNTER" + description: "Number of index scans initiated on this table" + - idx_tup_fetch: + usage: "COUNTER" + description: "Number of live rows fetched by index scans" + - n_tup_ins: + usage: "COUNTER" + description: "Number of rows inserted" + - n_tup_upd: + usage: "COUNTER" + description: "Number of rows updated" + - n_tup_del: + usage: "COUNTER" + description: "Number of rows deleted" + - n_tup_hot_upd: + usage: "COUNTER" + description: "Number of rows HOT updated (i.e., with no separate index update required)" + - n_live_tup: + usage: "GAUGE" + description: "Estimated number of live rows" + - n_dead_tup: + usage: "GAUGE" + description: "Estimated number of dead rows" + - n_mod_since_analyze: + usage: "GAUGE" + description: "Estimated number of rows changed since last analyze" + - last_vacuum: + usage: "GAUGE" + description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" + - last_autovacuum: + usage: "GAUGE" + description: "Last time at which this table was vacuumed by the autovacuum daemon" + - last_analyze: + usage: "GAUGE" + description: "Last time at which this table was manually analyzed" + - last_autoanalyze: + usage: "GAUGE" + description: "Last time at which this table was analyzed by the autovacuum daemon" + - vacuum_count: + usage: "COUNTER" + description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" + - autovacuum_count: + usage: "COUNTER" + description: "Number of times this table has been vacuumed by the autovacuum daemon" + - analyze_count: + usage: "COUNTER" + description: "Number of times this table has been manually analyzed" + - autoanalyze_count: + usage: "COUNTER" + description: "Number of times this table has been analyzed by the autovacuum daemon" + +pg_statio_user_tables: + query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables" + metrics: + - datname: + usage: "LABEL" + description: "Name of current database" + - schemaname: + usage: "LABEL" + description: "Name of the schema that this table is in" + - relname: + usage: "LABEL" + description: "Name of this table" + - heap_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table" + - heap_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table" + - idx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from all indexes on this table" + - idx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in all indexes on this table" + - toast_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table (if any)" + - toast_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table (if any)" + - tidx_blks_read: + usage: "COUNTER" + description: "Number of disk blocks read from this table's TOAST table indexes (if any)" + - tidx_blks_hit: + usage: "COUNTER" + description: "Number of buffer hits in this table's TOAST table indexes (if any)" +<% end -%> + +pg_process_idle: + query: | + WITH + metrics AS ( + SELECT + state, + application_name, + SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum, + COUNT(*) AS process_idle_seconds_count + FROM pg_stat_activity + WHERE state ~ '^idle' + GROUP BY state, application_name + ), + buckets AS ( + SELECT + state, + application_name, + le, + SUM( + CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le + THEN 1 + ELSE 0 + END + )::bigint AS bucket + FROM + pg_stat_activity, + UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le + GROUP BY state, application_name, le + ORDER BY state, application_name, le + ) + SELECT + state, + application_name, + process_idle_seconds_sum as seconds_sum, + process_idle_seconds_count as seconds_count, + ARRAY_AGG(le) AS seconds, + ARRAY_AGG(bucket) AS seconds_bucket + FROM metrics JOIN buckets USING (state, application_name) + GROUP BY 1, 2, 3, 4 + master: true + metrics: + - state: + usage: "LABEL" + description: "State" + - application_name: + usage: "LABEL" + description: "Application Name" + - seconds: + usage: "HISTOGRAM" + description: "Idle time of server processes" + +pg_unfrozen_ids: + query: "SELECT current_database() AS datname, max(age(relfrozenxid)) AS xid_age, max(mxid_age(relminmxid)) AS mxid_age FROM pg_class WHERE relkind IN ('r', 'm')" + metrics: - datname: usage: "LABEL" description: "Name of the database" - - size_bytes: + - xid_age: + usage: "GAUGE" + description: "Age of the oldest unfrozen transaction ID in this database" + - mxid_age: + usage: "GAUGE" + description: "Age of the oldest unfrozen multixact ID in this database" + +pg_wal: + query: "SELECT count(*) AS segment_count FROM pg_ls_waldir() WHERE name ~ '^[0-9A-Z]{24}$'" + master: true + metrics: + - segment_count: usage: "GAUGE" - description: "Disk space used by the database" + description: "Number of WAL segments"