PostgreSQL exporter deployment #

In this tutorial, we’ll deploy the Prometheus PostgreSQL exporter with our recommanded configuration to collect all key metrics.

Requirements #

Helm (v3.0+)
Kubectl (v1.25+)
Kubernetes (v1.25+) with cluster admin permissions
PostgreSQL admin permissions

This tutorial assumes you have some knowledge of Kubernetes, Helm and PostgreSQL.

Steps #

Create prometheus_postgresql_exporter PostgreSQL user with pg_monitor role

CREATE ROLE prometheus_postgresql_exporter IN ROLE pg_monitor PASSWORD 'hackme' LOGIN; -- Replace with a better password

Create a Kubernetes secret that contain PostgreSQL exporter password

KUBERNETES_NAMESPACE=monitoring
POSTGRESQL_PASSWORD=hackme # Replace with a better password
kubectl create secret generic postgresql-exporter-credentials --namespace ${KUBERNETES_NAMESPACE} --from-literal=password=${POSTGRESQL_PASSWORD}

Create values.yaml configuration file for PostgreSQL exporter

config:
  datasource:
    host: <replace_with_your_postgresql_host> # Replace with you PostgreSQL host
    user: prometheus_postgresql_exporter
    passwordSecret:
      key: postgresql-exporter-credentials
      name: password

Download customized queries.yaml file

queries.yaml

---
pg_replication:
query: |
    SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag;    
master: true
metrics:
    - lag:
        usage: "GAUGE"
        description: "Replication lag behind master in seconds"

pg_stat_connections:
query: |
    SELECT state, count(*) as count from pg_stat_activity
    WHERE pid <> pg_backend_pid()
    GROUP BY 1
    ORDER BY 1;    
master: true
metrics:
    - state:
        usage: "LABEL"
        description: "Connection status"
    - count:
        usage: "GAUGE"
        description: "Number of connections"

pg_postmaster:
query: |
    SELECT pg_postmaster_start_time as start_time_seconds
    FROM pg_postmaster_start_time();    
master: true
metrics:
    - start_time_seconds:
        usage: "GAUGE"
        description: "Time at which postmaster started"

pg_database:
query: |
    SELECT pg_database.datname, pg_database_size(pg_database.datname) as size_bytes
    FROM pg_database    
master: true
cache_seconds: 30
metrics:
    - datname:
        usage: "LABEL"
        description: "Name of the database"
    - size_bytes:
        usage: "GAUGE"
        description: "Disk space used by the database"

pg_replication_slots:
query: |
    SELECT
        slot_name,
        database,
        active,
        slot_type,
        xmin,
        pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) as replication_lag,
        pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) as confirmed_lag,
        CASE
            WHEN safe_wal_size IS NOT NULL
                THEN (select (safe_wal_size / 1024 / 1024) * 100 / (setting::int) from pg_settings where name = 'max_slot_wal_keep_size')
            else
                100
        END as available_storage_percent
    FROM pg_replication_slots;    
master: true
metrics:
    - slot_name:
        usage: "LABEL"
        description: "Name of the replication slot"
    - database:
        usage: "LABEL"
        description: "Name of the database"
    - active:
        usage: "GAUGE"
        description: "Flag indicating if the slot is active"
    - slot_type:
        usage: "LABEL"
        description: "The slot type: physical or logical"
    - xmin:
        usage: "GAUGE"
        description: "The oldest transaction that this slot needs the database to retain. VACUUM cannot remove tuples deleted by any later transaction."
    - replication_lag:
        usage: "GAUGE"
        description: "Replication lag in bytes"
    - confirmed_lag:
        usage: "GAUGE"
        description: "Replication confirmed lag in bytes"
    - available_storage_percent:
        usage: "GAUGE"
        description: "Available storage regarding max_slot_wal_keep_size in percent"

pg_stat_database:
query: |
    SELECT datid, datname, numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, blk_read_time, blk_write_time, sessions, sessions_abandoned, stats_reset
    FROM pg_stat_database
    where datname = current_database();    
master: false  # This query need to be executed per database to avoid abnormal metrics in staging
metrics:
    - datid:
        usage: LABEL
        description: "OID of a database"
    - datname:
        usage: LABEL
        description: "Name of this database"
    - numbackends:
        usage: GAUGE
        description: "Number of backends currently connected to this database. This is the only column in this view that returns a value reflecting current state; all other columns return the accumulated values since the last reset."
    - xact_commit:
        usage: COUNTER
        description: "Number of transactions in this database that have been committed"
    - xact_rollback:
        usage: COUNTER
        description: "Number of transactions in this database that have been rolled back"
    - blks_read:
        usage: COUNTER
        description: "Number of disk blocks read in this database"
    - blks_hit:
        usage: COUNTER
        description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)"
    - tup_returned:
        usage: COUNTER
        description: "Number of rows returned by queries in this database"
    - tup_fetched:
        usage: COUNTER
        description: "Number of rows fetched by queries in this database"
    - tup_inserted:
        usage: COUNTER
        description: "Number of rows inserted by queries in this database"
    - tup_updated:
        usage: COUNTER
        description: "Number of rows updated by queries in this database"
    - tup_deleted:
        usage: COUNTER
        description: "Number of rows deleted by queries in this database"
    - conflicts:
        usage: COUNTER
        description: "Number of queries canceled due to conflicts with recovery in this database. (Conflicts occur only on standby servers; see pg_stat_database_conflicts for details.)"
    - temp_files:
        usage: COUNTER
        description: "Number of temporary files created by queries in this database. All temporary files are counted, regardless of why the temporary file was created (e.g., sorting or hashing), and regardless of the log_temp_files setting."
    - temp_bytes:
        usage: COUNTER
        description: "Total amount of data written to temporary files by queries in this database. All temporary files are counted, regardless of why the temporary file was created, and regardless of the log_temp_files setting."
    - deadlocks:
        usage: COUNTER
        description: "Number of deadlocks detected in this database"
    - blk_read_time:
        usage: COUNTER
        description: "Time spent reading data file blocks by backends in this database, in milliseconds"
    - blk_write_time:
        usage: COUNTER
        description: "Time spent writing data file blocks by backends in this database, in milliseconds"
    - stats_reset:
        usage: COUNTER
        description: "Time at which these statistics were last reset"
    - sessions:
        usage: COUNTER
        description: "Total number of sessions established to this database"
    - sessions_abandoned:
        usage: COUNTER
        description: "Number of database sessions to this database that were terminated because connection to the client was lost"

pg_stat_user_tables:
query: |
    SELECT
    current_database() datname,
    u.schemaname,
    u.relname,
    u.seq_scan,
    u.seq_tup_read,
    u.idx_scan,
    u.idx_tup_fetch,
    u.n_tup_ins,
    u.n_tup_upd,
    u.n_tup_del,
    u.n_tup_hot_upd,
    u.n_live_tup,
    u.n_dead_tup,
    u.n_mod_since_analyze,
    COALESCE(u.last_vacuum, '1970-01-01Z') as last_vacuum,
    COALESCE(u.last_autovacuum, '1970-01-01Z') as last_autovacuum,
    COALESCE(u.last_analyze, '1970-01-01Z') as last_analyze,
    COALESCE(u.last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
    u.vacuum_count,
    u.autovacuum_count,
    u.analyze_count,
    u.autoanalyze_count,
    table_size + index_size AS total_bytes,
    index_size AS index_bytes,
    toast_size AS toast_bytes,
    table_size - coalesce(toast_size,0) AS table_bytes
    FROM
    pg_stat_user_tables u JOIN
    (
        SELECT
        pg_class.oid,
        pg_table_size(pg_class.oid)                    AS table_size,
        pg_indexes_size(pg_class.oid)                  AS index_size,
        pg_total_relation_size(pg_class.reltoastrelid) AS toast_size
        FROM
        pg_stat_user_tables u
        JOIN pg_class ON pg_class.oid = u.relid AND pg_class.relkind <> 'm' -- exclude matviews to prevent query being locked when refreshing MV
    UNION ALL
        SELECT
        c.oid,
        SUM(c.relpages::bigint*8192) AS table_size,
        coalesce(SUM(idx.index_bytes),0) as index_size,
        coalesce(SUM((c2.relpages+c3.relpages)::bigint*8192),0) AS toast_size
        FROM pg_stat_user_tables u
        JOIN pg_class c ON u.relid=c.oid AND c.relkind='m'   -- matviews only
        LEFT JOIN pg_class c2 ON c2.oid = c.reltoastrelid
        LEFT JOIN pg_index it ON it.indrelid=c.reltoastrelid  -- only one index per pg_toast table
        LEFT JOIN pg_class c3 ON c3.oid=it.indexrelid
        CROSS JOIN LATERAL (
            SELECT SUM(c4.relpages::bigint*8192) AS index_bytes
            FROM pg_index i JOIN pg_class c4 ON i.indrelid=c.oid AND c4.oid=i.indexrelid
        ) idx
        GROUP BY c.oid
    ) t ON u.relid = t.oid
    ;    
metrics:
    - datname:
        usage: "LABEL"
        description: "Name of current database"
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - seq_scan:
        usage: "COUNTER"
        description: "Number of sequential scans initiated on this table"
    - seq_tup_read:
        usage: "COUNTER"
        description: "Number of live rows fetched by sequential scans"
    - idx_scan:
        usage: "COUNTER"
        description: "Number of index scans initiated on this table"
    - idx_tup_fetch:
        usage: "COUNTER"
        description: "Number of live rows fetched by index scans"
    - n_tup_ins:
        usage: "COUNTER"
        description: "Number of rows inserted"
    - n_tup_upd:
        usage: "COUNTER"
        description: "Number of rows updated"
    - n_tup_del:
        usage: "COUNTER"
        description: "Number of rows deleted"
    - n_tup_hot_upd:
        usage: "COUNTER"
        description: "Number of rows HOT updated (i.e., with no separate index update required)"
    - n_live_tup:
        usage: "GAUGE"
        description: "Estimated number of live rows"
    - n_dead_tup:
        usage: "GAUGE"
        description: "Estimated number of dead rows"
    - n_mod_since_analyze:
        usage: "GAUGE"
        description: "Estimated number of rows changed since last analyze"
    - last_vacuum:
        usage: "GAUGE"
        description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
    - last_autovacuum:
        usage: "GAUGE"
        description: "Last time at which this table was vacuumed by the autovacuum daemon"
    - last_analyze:
        usage: "GAUGE"
        description: "Last time at which this table was manually analyzed"
    - last_autoanalyze:
        usage: "GAUGE"
        description: "Last time at which this table was analyzed by the autovacuum daemon"
    - vacuum_count:
        usage: "COUNTER"
        description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
    - autovacuum_count:
        usage: "COUNTER"
        description: "Number of times this table has been vacuumed by the autovacuum daemon"
    - analyze_count:
        usage: "COUNTER"
        description: "Number of times this table has been manually analyzed"
    - autoanalyze_count:
        usage: "COUNTER"
        description: "Number of times this table has been analyzed by the autovacuum daemon"
    - total_bytes:
        usage: "GAUGE"
        description: "Total disk space used by the specified table, including all indexes and TOAST data"
    - index_bytes:
        usage: "GAUGE"
        description: "Total disk space used by indexes attached to the specified table"
    - toast_bytes:
        usage: "GAUGE"
        description: "Total disk space used by TOAST data"
    - table_bytes:
        usage: "GAUGE"
        description: "Total disk space used by the specified table, excluding indexes and TOAST data"

pg_locks:
master: false  # This query need to be executed per database to avoid abnormal metrics in staging
query: |
    SELECT pg_database.datname,tmp.mode,COALESCE(count,0) as count
    FROM
    (
        VALUES ('accesssharelock'),
        ('rowsharelock'),
        ('rowexclusivelock'),
        ('shareupdateexclusivelock'),
        ('sharelock'),
        ('sharerowexclusivelock'),
        ('exclusivelock'),
        ('accessexclusivelock'),
        ('sireadlock')
    ) AS tmp(mode) CROSS JOIN pg_database
    LEFT JOIN
    (
        SELECT database, lower(mode) AS mode,count(*) AS count
        FROM pg_locks WHERE database IS NOT NULL
        GROUP BY database, lower(mode)
    ) AS tmp2
    ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database
    WHERE pg_database.datname = current_database()
    ORDER BY 1    
metrics:
    - datname:
        usage: "LABEL"
        description: "Name of current database"
    - mode:
        usage: "LABEL"
        description: "Name of the lock mode"
    - count:
        usage: "GAUGE"
        description: "Number of locks held by open transactions"

pg_statio_user_tables:
query: |
    SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit
    FROM pg_statio_user_tables;    
metrics:
    - datname:
        usage: "LABEL"
        description: "Name of current database"
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - heap_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table"
    - heap_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table"
    - idx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from all indexes on this table"
    - idx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in all indexes on this table"
    - toast_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table's TOAST table (if any)"
    - toast_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table's TOAST table (if any)"
    - tidx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
    - tidx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table's TOAST table indexes (if any)"

pg_stat_progress_vacuum:
query: |
    SELECT
    s.pid,
    s.datname,
    s.relid,
    s.relid::regclass as relname,
    extract(epoch from now() - xact_start) as duration,
    CASE s.phase WHEN 'initializing' THEN 0
        WHEN 'scanning heap' THEN 1
        WHEN 'vacuuming indexes' THEN 2
        WHEN 'vacuuming heap' THEN 3
        WHEN 'cleaning up indexes' THEN 4
        WHEN 'truncating heap' THEN 5
        WHEN 'performing final cleanup' THEN 6
        END AS phase,
    CASE
        WHEN a.query ~*'^autovacuum.*to prevent wraparound' THEN 2
        WHEN a.query ~*'^vacuum' THEN 1
        ELSE 0
        END AS mode,
    s.heap_blks_total,
    s.heap_blks_scanned,
    s.heap_blks_vacuumed,
    s.index_vacuum_count,
    s.max_dead_tuples,
    s.num_dead_tuples
    FROM pg_stat_progress_vacuum as s
    JOIN pg_stat_activity AS a USING ( pid )
    where S.datname = current_database();    
master: false  # This query need to be executed per database to resolve relname (relid::regclass)
metrics:
    - pid:
        usage: GAUGE
        description: Process ID of backend
    - datname:
        usage: LABEL
        description: Name of the database to which this backend is connected
    - relname:
        usage: LABEL
        description: Name of the table
    - duration:
        usage: GAUGE
        description: Vacuum duration in seconds
    - relid:
        usage: LABEL
        description: OID of the resource
    - phase:
        usage: GAUGE
        description: Phase
    - mode:
        usage: GAUGE
        description: "Vacuum mode (0: autovacuum, 1: user, 2: prevent wraparound)"
    - heap_blks_total:
        usage: GAUGE
        description: Total number of heap blocks in the table
    - heap_blks_scanned:
        usage: GAUGE
        description: Number of heap blocks scanned
    - heap_blks_vacuumed:
        usage: GAUGE
        description: Number of heap blocks vacuumed
    - index_vacuum_count:
        usage: GAUGE
        description: Number of completed index vacuum cycles
    - max_dead_tuples:
        usage: GAUGE
        description: Number of dead tuples that we can store before needing to perform an index vacuum cycle.
    - num_dead_tuples:
        usage: GAUGE
        description: Number of dead tuples collected since the last index vacuum cycle.

pg_stat_replication_slots:
query: |
    SELECT slot_name, spill_txns, spill_count, spill_bytes, total_txns, total_bytes
    FROM pg_stat_replication_slots;    
master: true
metrics:
    - slot_name:
        usage: LABEL
        description: "A unique, cluster-wide identifier for the replication slot"
    - spill_txns:
        usage: COUNTER
        description: "Number of transactions spilled to disk once the memory used by logical decoding to decode changes from WAL has exceeded logical_decoding_work_mem. The counter gets incremented for both top-level transactions and subtransactions"
    - spill_count:
        usage: COUNTER
        description: "Number of times transactions were spilled to disk while decoding changes from WAL for this slot. This counter is incremented each time a transaction is spilled, and the same transaction may be spilled multiple times"
    - spill_bytes:
        usage: COUNTER
        description: "Amount of decoded transaction data spilled to disk while performing decoding of changes from WAL for this slot. This and other spill counters can be used to gauge the I/O which occurred during logical decoding and allow tuning logical_decoding_work_mem."
    - total_txns:
        usage: COUNTER
        description: "Number of decoded transactions sent to the decoding output plugin for this slot. This counts top-level transactions only, and is not incremented for subtransactions. Note that this includes the transactions that are streamed and/or spilled."
    - total_bytes:
        usage: COUNTER
        description: "Amount of transaction data decoded for sending transactions to the decoding output plugin while decoding changes from WAL for this slot. Note that this includes data that is streamed and/or spilled"

pg_stat_user_indexes:
query: |
    SELECT current_database() datname, us.schemaname, us.relname, us.indexrelname, us.idx_scan, us.idx_tup_read, us.idx_tup_fetch, io.idx_blks_read, io.idx_blks_hit, ind.indisvalid
    FROM pg_index as ind, pg_stat_user_indexes as us, pg_statio_user_indexes as io
    WHERE ind.indexrelid = us.indexrelid
    AND ind.indexrelid = io.indexrelid;    
master: false  # This query need to be executed per database to resolve relname (relid::regclass)
metrics:
    - datname:
        usage: "LABEL"
        description: "Name of current database"
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - indexrelname:
        usage: "LABEL"
        description: "Name of this index"
    - idx_scan:
        usage: "COUNTER"
        description: "Number of index scans initiated on this index"
    - idx_tup_read:
        usage: "COUNTER"
        description: "Number of index entries returned by scans on this index"
    - idx_tup_fetch:
        usage: "COUNTER"
        description: "Number of live table rows fetched by simple index scans using this index"
    - idx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this index"
    - idx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this index"
    - indisvalid:
        usage: "LABEL"
        description: "If true, the index is currently valid for queries. False means the index is possibly incomplete: it must still be modified by INSERT/UPDATE operations, but it cannot safely be used for queries"

# Collect pid, usename and datname information of long running queries
pg_active_backend:
query: |
    SELECT
        pid,
        usename,
        datname,
        EXTRACT('minutes' FROM  now() - state_change) AS duration_minutes
    FROM
        pg_stat_activity
    WHERE
        state = 'active'
        AND backend_type != 'walsender'
        AND EXTRACT('minutes' FROM  now() - state_change) > 5;    
master: true
metrics:
    - usename:
        usage: "LABEL"
        description: "PostgreSQL username"
    - datname:
        usage: "LABEL"
        description: "PostgreSQL database"
    - pid:
        usage: "LABEL"
        description: "Backend pid"
    - duration_minutes:
        usage: "GAUGE"
        description: "Query duration"

pg_statio_all_tables:
query: |
    SELECT
        relid,
        current_database() datname,
        schemaname,
        relname,
        heap_blks_read,
        heap_blks_hit,
        idx_blks_read,
        idx_blks_hit
    FROM
        pg_statio_all_tables
    WHERE
        current_database() in ('qonto_api', 'qonto_auth', 'qonto_biller', 'qonto_fraud')
        AND schemaname not in ('pg_catalog', 'pg_toast', 'information_schema')
        AND relname not like 'sca_sessions_partitions_%'    
master: false # This query need to be executed per database to have the metric
metrics:
    - relid:
        usage: LABEL
        description: OID of the resource
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - datname:
        usage: "LABEL"
        description: "Name of the database"
    - heap_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table"
    - heap_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table"
    - idx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from all indexes on this table"
    - idx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in all indexes on this table"

Deploy PostgreSQL exporter using prometheus-postgres-exporter helm chart

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm upgrade --install prometheus-postgres-exporter prometheus-community/prometheus-postgres-exporter --namespace ${KUBERNETES_NAMESPACE} --values values.yaml --set-file config.queries=queries.yaml