
Replication lag troubleshooting

🫀 Keep your circulation system healthy

This guide helps you diagnose and resolve streaming replication lag issues that can impact your high availability setup.

Quick diagnosis

Start with these queries to understand your replication status:

On Primary

-- Check replication status from primary
SELECT
    client_addr,
    client_hostname,
    state,
    sent_lsn,
    write_lsn,
    flush_lsn,
    replay_lsn,
    write_lag,
    flush_lag,
    replay_lag,
    sync_state,
    sync_priority
FROM pg_stat_replication;

-- Check the current WAL position (sample this repeatedly to estimate the WAL generation rate)
SELECT
    pg_current_wal_lsn(),
    pg_walfile_name(pg_current_wal_lsn()) as current_wal_file;

-- Monitor WAL sender processes
SELECT
    pid,
    usename,
    application_name,
    client_addr,
    backend_start,
    state,
    sync_state
FROM pg_stat_replication;
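
For a single number per standby, byte lag can also be computed directly from pg_stat_replication on the primary. This is only a convenience on top of the queries above; pg_wal_lsn_diff() and pg_size_pretty() are standard functions, and the thresholds you apply to the result are up to you:

-- Per-standby byte lag, measured from the primary
SELECT
    application_name,
    client_addr,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn)) as send_lag,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) as replay_lag
FROM pg_stat_replication;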

On Standby

-- Check if we're in recovery
SELECT pg_is_in_recovery();

-- Get last received and replayed WAL positions
SELECT
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_last_xact_replay_timestamp() as last_replay_time;

-- Calculate lag in seconds
SELECT
    EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag_seconds,
    CASE
        WHEN pg_last_xact_replay_timestamp() IS NULL THEN 'No activity'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 5 THEN 'Excellent'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 30 THEN 'Good'
        WHEN EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) < 60 THEN 'Warning'
        ELSE 'Critical'
    END as lag_status;
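
Keep in mind that this time-based figure keeps growing whenever the primary is simply idle, because pg_last_xact_replay_timestamp() reports the last replayed transaction rather than true lag. A quick sanity check is to confirm the standby has replayed everything it has received:

-- If received and replayed LSNs match, the standby is caught up,
-- even when the time-based lag looks large on an idle primary
SELECT pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() as caught_up;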

Detailed lag analysis

Measure byte lag

-- On primary: get current LSN
SELECT pg_current_wal_lsn() as primary_lsn;

-- On standby: compare with replay position
SELECT
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) as receive_replay_lag_bytes;
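
This measures how far replay trails behind what the standby has already received. To measure how far the standby trails the primary itself, take the LSN returned by the first query on the primary and feed it into pg_wal_lsn_diff() on the standby. The LSN literal below is only a placeholder for the value you observed:

-- On standby: bytes behind the primary LSN captured above ('0/3000148' is a placeholder)
SELECT pg_size_pretty(
    pg_wal_lsn_diff('0/3000148'::pg_lsn, pg_last_wal_replay_lsn())
) as replay_lag_behind_primary;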

WAL streaming diagnostics

-- Check WAL receiver status (on standby)
SELECT
    pid,
    status,
    receive_start_lsn,
    receive_start_tli,
    written_lsn,    -- written_lsn/flushed_lsn replaced the single received_lsn column in PostgreSQL 13
    flushed_lsn,
    received_tli,
    last_msg_send_time,
    last_msg_receipt_time,
    latest_end_lsn,
    latest_end_time,
    slot_name,
    sender_host,
    sender_port,
    conninfo
FROM pg_stat_wal_receiver;
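
A WAL receiver that is connected but no longer hearing from the primary is easy to miss in the output above. One simple check, using the same view, is the age of the last message received (the one-minute threshold is just an example):

-- Flag a receiver that has not heard from the primary recently
SELECT
    status,
    now() - last_msg_receipt_time as time_since_last_message,
    (now() - last_msg_receipt_time) > interval '1 minute' as possibly_stalled
FROM pg_stat_wal_receiver;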

-- Check replication slots (on primary)
SELECT
    slot_name,
    plugin,
    slot_type,
    datoid,
    database,
    active,
    active_pid,
    xmin,
    catalog_xmin,
    restart_lsn,
    confirmed_flush_lsn
FROM pg_replication_slots;
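
An inactive or slow slot forces the primary to retain WAL and can eventually fill pg_wal. A useful companion query on the primary estimates how much WAL each slot is currently holding back:

-- WAL retained on the primary for each replication slot
SELECT
    slot_name,
    active,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as retained_wal
FROM pg_replication_slots
ORDER BY pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) DESC NULLS LAST;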

Common issues and solutions

High WAL generation

-- Find tables with most WAL activity
SELECT
    schemaname,
    relname,
    n_tup_ins + n_tup_upd + n_tup_del as total_changes,
    n_tup_hot_upd,
    n_dead_tup
FROM pg_stat_user_tables
ORDER BY total_changes DESC
LIMIT 10;

-- Check for long-running transactions (can cause WAL retention)
SELECT
    pid,
    now() - xact_start as duration,
    state,
    query
FROM pg_stat_activity
WHERE xact_start IS NOT NULL
ORDER BY duration DESC;
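
If one of these transactions is clearly stuck and is forcing WAL retention or blocking replay, it can be ended with the standard cancel/terminate functions. The pid below is illustrative; check what the session is doing before terminating it:

-- Cancel the running query in a session (gentler option); 12345 is an example pid
SELECT pg_cancel_backend(12345);

-- Terminate the whole session if cancelling is not enough
SELECT pg_terminate_backend(12345);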

Network and I/O monitoring

-- Monitor background writer activity
SELECT
    checkpoints_timed,
    checkpoints_req,
    checkpoint_write_time,
    checkpoint_sync_time,
    buffers_checkpoint,
    buffers_clean,
    maxwritten_clean,
    buffers_backend,
    buffers_backend_fsync,
    buffers_alloc
FROM pg_stat_bgwriter;
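
Note that on PostgreSQL 17 and later the checkpoint-related counters above were moved out of pg_stat_bgwriter into the dedicated pg_stat_checkpointer view (and the backend write/fsync counters into pg_stat_io), so if this query fails on missing columns, look there instead:

-- PostgreSQL 17+: checkpoint statistics live in a dedicated view
SELECT * FROM pg_stat_checkpointer;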

-- Check for I/O bottlenecks
SELECT
    schemaname,
    relname,
    heap_blks_read,
    heap_blks_hit,
    heap_blks_hit::float / (heap_blks_hit + heap_blks_read) as hit_ratio
FROM pg_statio_user_tables
WHERE heap_blks_read > 0
ORDER BY heap_blks_read DESC;
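
Beyond raw I/O, replay on the standby can also stall because incoming WAL conflicts with queries running there (behaviour controlled by max_standby_streaming_delay). The per-database conflict counters, queried on the standby, show whether this is happening:

-- Recovery conflict counts, per database (run on the standby)
SELECT
    datname,
    confl_tablespace,
    confl_lock,
    confl_snapshot,
    confl_bufferpin,
    confl_deadlock
FROM pg_stat_database_conflicts;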

Configuration verification

Essential replication settings to check:

-- Check key replication parameters
SELECT name, setting, unit, context
FROM pg_settings
WHERE name IN (
    'wal_level',
    'max_wal_senders',
    'max_replication_slots',
    'wal_keep_size',
    'hot_standby',
    'max_standby_streaming_delay',
    'max_standby_archive_delay'
);

-- Check archive settings
SELECT name, setting
FROM pg_settings
WHERE name LIKE '%archive%';
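
If a parameter needs changing (for example raising wal_keep_size, available since PostgreSQL 13, so that a briefly disconnected standby can catch up without a replication slot), ALTER SYSTEM plus a configuration reload is usually enough. The 1GB value below is only an illustration; size it to your WAL generation rate:

-- Example only: retain more WAL on the primary for lagging standbys
ALTER SYSTEM SET wal_keep_size = '1GB';
SELECT pg_reload_conf();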

Monitoring setup

Create a monitoring view for ongoing lag tracking:

-- Create the monitoring view on the primary so it replicates to the standby
-- (a hot standby is read-only, so the DDL cannot run there); query it on the standby
CREATE OR REPLACE VIEW replication_lag_monitor AS
SELECT
    now() as check_time,
    pg_is_in_recovery() as is_standby,
    pg_last_wal_receive_lsn() as received_lsn,
    pg_last_wal_replay_lsn() as replayed_lsn,
    pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) as receive_replay_lag_bytes,
    EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag_seconds,
    pg_last_xact_replay_timestamp() as last_replay_time;

-- Use the monitoring view
SELECT * FROM replication_lag_monitor;
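
Because a hot standby is read-only, lag history has to be recorded somewhere writable. One option that stays inside PostgreSQL is to sample the primary-side lag columns of pg_stat_replication into a table on the primary; the table name and schedule below are only examples:

-- On the primary: example history table for per-standby lag samples
CREATE TABLE IF NOT EXISTS replication_lag_history (
    sampled_at timestamptz DEFAULT now(),
    application_name text,
    client_addr inet,
    replay_lag interval,
    replay_lag_bytes numeric
);

-- Run this periodically from your scheduler of choice
INSERT INTO replication_lag_history (application_name, client_addr, replay_lag, replay_lag_bytes)
SELECT
    application_name,
    client_addr,
    replay_lag,
    pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)
FROM pg_stat_replication;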

Alerting thresholds

Set up alerts based on these thresholds:

  • Warning: Lag > 30 seconds or > 100MB
  • Critical: Lag > 300 seconds or > 1GB
  • Emergency: Lag > 1800 seconds or > 5GB

-- Sample alerting query
SELECT
    CASE
        WHEN lag_seconds > 1800 OR receive_replay_lag_bytes > 5::bigint * 1024*1024*1024 THEN 'EMERGENCY'
        WHEN lag_seconds > 300 OR receive_replay_lag_bytes > 1024*1024*1024 THEN 'CRITICAL'
        WHEN lag_seconds > 30 OR receive_replay_lag_bytes > 100*1024*1024 THEN 'WARNING'
        ELSE 'OK'
    END as alert_level,
    lag_seconds,
    pg_size_pretty(receive_replay_lag_bytes) as lag_size
FROM replication_lag_monitor;