
Replication lag troubleshooting

🫀 Keep your circulation system healthy

This guide helps you diagnose and resolve streaming replication lag issues that can impact your high availability setup.

Quick diagnosis

Start with these queries to understand your replication status:

On Primary

-- Check replication status from primary
SELECT
    client_addr,
    client_hostname,
    state,
    sent_lsn,
    write_lsn,
    flush_lsn,
    replay_lsn,
    write_lag,
    flush_lag,
    replay_lag,
    sync_state,
    sync_priority
FROM pg_stat_replication;

-- Check the current WAL position (sample this repeatedly to estimate the WAL generation rate)
SELECT
    pg_current_wal_lsn(),
    pg_walfile_name(pg_current_wal_lsn()) as current_wal_file;

-- Monitor WAL sender processes
SELECT
    pid,
    usename,
    application_name,
    client_addr,
    backend_start,
    state,
    sync_state
FROM pg_stat_replication;
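
For a single number per standby, byte lag can also be computed directly from pg_stat_replication on the primary. This is only a convenience on top of the queries above; pg_wal_lsn_diff() and pg_size_pretty() are standard functions, and the thresholds you apply to the result are up to you:

-- Per-standby byte lag, measured from the primary
SELECT
    application_name,
    client_addr,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn)) as send_lag,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) as replay_lag
FROM pg_stat_replication;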

On Standby

-- Check if we're in recovery
SELECT pg_is_in_recovery();

-- Get last received and replayed WAL positions
SELECT
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_last_xact_replay_timestamp() as last_replay_time;

-- Calculate lag in seconds
SELECT
    EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag_seconds,
    CASE
        WHEN pg_last_xact_replay_timestamp() IS NULL THEN 'No activity'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 5 THEN 'Excellent'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 30 THEN 'Good'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 60 THEN 'Warning'
        ELSE 'Critical'
    END as lag_status;
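
Keep in mind that this time-based figure keeps growing whenever the primary is simply idle, because pg_last_xact_replay_timestamp() reports the last replayed transaction rather than true lag. A quick sanity check is to confirm the standby has replayed everything it has received:

-- If received and replayed LSNs match, the standby is caught up,
-- even when the time-based lag looks large on an idle primary
SELECT pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() as caught_up;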

Detailed lag analysis

Measure byte lag

-- On primary: get current LSN
SELECT pg_current_wal_lsn() as primary_lsn;

-- On standby: compare with replay position
SELECT
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) as receive_replay_lag_bytes;
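
This measures how far replay trails behind what the standby has already received. To measure how far the standby trails the primary itself, take the LSN returned by the first query on the primary and feed it into pg_wal_lsn_diff() on the standby. The LSN literal below is only a placeholder for the value you observed:

-- On standby: bytes behind the primary LSN captured above ('0/3000148' is a placeholder)
SELECT pg_size_pretty(
    pg_wal_lsn_diff('0/3000148'::pg_lsn, pg_last_wal_replay_lsn())
) as replay_lag_behind_primary;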

WAL streaming diagnostics

-- Check WAL receiver status (on standby)
SELECT
    pid,
    status,
    receive_start_lsn,
    receive_start_tli,
    written_lsn,    -- written_lsn/flushed_lsn replaced the single received_lsn column in PostgreSQL 13
    flushed_lsn,
    received_tli,
    last_msg_send_time,
    last_msg_receipt_time,
    latest_end_lsn,
    latest_end_time,
    slot_name,
    sender_host,
    sender_port,
    conninfo
FROM pg_stat_wal_receiver;
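
A WAL receiver that is connected but no longer hearing from the primary is easy to miss in the output above. One simple check, using the same view, is the age of the last message received (the one-minute threshold is just an example):

-- Flag a receiver that has not heard from the primary recently
SELECT
    status,
    now() - last_msg_receipt_time as time_since_last_message,
    (now() - last_msg_receipt_time) > interval '1 minute' as possibly_stalled
FROM pg_stat_wal_receiver;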

-- Check replication slots (on primary)
SELECT
    slot_name,
    plugin,
    slot_type,
    datoid,
    database,
    active,
    active_pid,
    xmin,
    catalog_xmin,
    restart_lsn,
    confirmed_flush_lsn
FROM pg_replication_slots;
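
An inactive or slow slot forces the primary to retain WAL and can eventually fill pg_wal. A useful companion query on the primary estimates how much WAL each slot is currently holding back:

-- WAL retained on the primary for each replication slot
SELECT
    slot_name,
    active,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as retained_wal
FROM pg_replication_slots
ORDER BY pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) DESC NULLS LAST;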

Common issues and solutions

High WAL generation

-- Find tables with most WAL activity
SELECT
    schemaname,
    relname,
    n_tup_ins + n_tup_upd + n_tup_del as total_changes,
    n_tup_hot_upd,
    n_dead_tup
FROM pg_stat_user_tables
ORDER BY total_changes DESC
LIMIT 10;

-- Check for long-running transactions (can cause WAL retention)
SELECT
    pid,
    now() - xact_start as duration,
    state,
    query
FROM pg_stat_activity
WHERE xact_start IS NOT NULL
ORDER BY duration DESC;
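
If one of these transactions is clearly stuck and is forcing WAL retention or blocking replay, it can be ended with the standard cancel/terminate functions. The pid below is illustrative; check what the session is doing before terminating it:

-- Cancel the running query in a session (gentler option); 12345 is an example pid
SELECT pg_cancel_backend(12345);

-- Terminate the whole session if cancelling is not enough
SELECT pg_terminate_backend(12345);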

Network and I/O monitoring

-- Monitor background writer activity
SELECT
    checkpoints_timed,
    checkpoints_req,
    checkpoint_write_time,
    checkpoint_sync_time,
    buffers_checkpoint,
    buffers_clean,
    maxwritten_clean,
    buffers_backend,
    buffers_backend_fsync,
    buffers_alloc
FROM pg_stat_bgwriter;
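
Note that on PostgreSQL 17 and later the checkpoint-related counters above were moved out of pg_stat_bgwriter into the dedicated pg_stat_checkpointer view (and the backend write/fsync counters into pg_stat_io), so if this query fails on missing columns, look there instead:

-- PostgreSQL 17+: checkpoint statistics live in a dedicated view
SELECT * FROM pg_stat_checkpointer;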

-- Check for I/O bottlenecks
SELECT
    schemaname,
    relname,
    heap_blks_read,
    heap_blks_hit,
    heap_blks_hit::float / (heap_blks_hit + heap_blks_read) as hit_ratio
FROM pg_statio_user_tables
WHERE heap_blks_read > 0
ORDER BY heap_blks_read DESC;
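
Beyond raw I/O, replay on the standby can also stall because incoming WAL conflicts with queries running there (behaviour controlled by max_standby_streaming_delay). The per-database conflict counters, queried on the standby, show whether this is happening:

-- Recovery conflict counts, per database (run on the standby)
SELECT
    datname,
    confl_tablespace,
    confl_lock,
    confl_snapshot,
    confl_bufferpin,
    confl_deadlock
FROM pg_stat_database_conflicts;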

Configuration verification

Essential replication settings to check:

-- Check key replication parameters
SELECT name, setting, unit, context
FROM pg_settings
WHERE name IN (
    'wal_level',
    'max_wal_senders',
    'max_replication_slots',
    'wal_keep_size',
    'hot_standby',
    'max_standby_streaming_delay',
    'max_standby_archive_delay'
);

-- Check archive settings
SELECT name, setting
FROM pg_settings
WHERE name LIKE '%archive%';
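
If a parameter needs changing (for example raising wal_keep_size, available since PostgreSQL 13, so that a briefly disconnected standby can catch up without a replication slot), ALTER SYSTEM plus a configuration reload is usually enough. The 1GB value below is only an illustration; size it to your WAL generation rate:

-- Example only: retain more WAL on the primary for lagging standbys
ALTER SYSTEM SET wal_keep_size = '1GB';
SELECT pg_reload_conf();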

Monitoring setup

Create a monitoring view for ongoing lag tracking:

-- Create the monitoring view on the primary so it replicates to the standby
-- (a hot standby is read-only, so the DDL cannot run there); query it on the standby
CREATE OR REPLACE VIEW replication_lag_monitor AS
SELECT
    now() as check_time,
    pg_is_in_recovery() as is_standby,
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) as receive_replay_lag_bytes,
    EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag_seconds,
    pg_last_xact_replay_timestamp() as last_replay_time;

-- Use the monitoring view
SELECT * FROM replication_lag_monitor;
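
Because a hot standby is read-only, lag history has to be recorded somewhere writable. One option that stays inside PostgreSQL is to sample the primary-side lag columns of pg_stat_replication into a table on the primary; the table name and schedule below are only examples:

-- On the primary: example history table for per-standby lag samples
CREATE TABLE IF NOT EXISTS replication_lag_history (
    sampled_at timestamptz DEFAULT now(),
    application_name text,
    client_addr inet,
    replay_lag interval,
    replay_lag_bytes numeric
);

-- Run this periodically from your scheduler of choice
INSERT INTO replication_lag_history (application_name, client_addr, replay_lag, replay_lag_bytes)
SELECT
    application_name,
    client_addr,
    replay_lag,
    pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)
FROM pg_stat_replication;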

Alerting thresholds

Set up alerts based on these thresholds:

  • Warning: Lag > 30 seconds or > 100MB
  • Critical: Lag > 300 seconds or > 1GB
  • Emergency: Lag > 1800 seconds or > 5GB

-- Sample alerting query
SELECT
    CASE
        WHEN lag_seconds > 1800 OR receive_replay_lag_bytes > 5::bigint * 1024*1024*1024 THEN 'EMERGENCY'
        WHEN lag_seconds > 300 OR receive_replay_lag_bytes > 1024*1024*1024 THEN 'CRITICAL'
        WHEN lag_seconds > 30 OR receive_replay_lag_bytes > 100*1024*1024 THEN 'WARNING'
        ELSE 'OK'
    END as alert_level,
    lag_seconds,
    pg_size_pretty(receive_replay_lag_bytes) as lag_size
FROM replication_lag_monitor;