@@ -1431,6 +1431,142 @@ wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_l
14311431 }
14321432}
14331433
1434+ /*
1435+ * Check stop_lsn (returned from pg_stop_backup()) and update backup->stop_lsn
1436+ */
1437+ void
1438+ wait_wal_and_calculate_stop_lsn (const char * xlog_path , XLogRecPtr stop_lsn , pgBackup * backup )
1439+ {
1440+ bool stop_lsn_exists = false;
1441+
1442+ /* It is ok for replica to return invalid STOP LSN
1443+ * UPD: Apparently it is ok even for a master.
1444+ */
1445+ if (!XRecOffIsValid (stop_lsn ))
1446+ {
1447+ XLogSegNo segno = 0 ;
1448+ XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
1449+
1450+ /*
1451+ * Even though the value is invalid, it's expected postgres behaviour
1452+ * and we're trying to fix it below.
1453+ */
1454+ elog (LOG , "Invalid offset in stop_lsn value %X/%X, trying to fix" ,
1455+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1456+
1457+ /*
1458+ * Note: even with gdb it is very hard to produce automated tests for
1459+ * contrecord + invalid LSN, so emulate it for manual testing.
1460+ */
1461+ //lsn = lsn - XLOG_SEG_SIZE;
1462+ //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1463+ // (uint32) (stop_lsn >> 32), (uint32) (stop_lsn));
1464+
1465+ GetXLogSegNo (stop_lsn , segno , instance_config .xlog_seg_size );
1466+
1467+ /*
1468+ * Note, that there is no guarantee that corresponding WAL file even exists.
1469+ * Replica may return LSN from future and keep staying in present.
1470+ * Or it can return invalid LSN.
1471+ *
1472+ * That's bad, since we want to get real LSN to save it in backup label file
1473+ * and to use it in WAL validation.
1474+ *
1475+ * So we try to do the following:
1476+ * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1477+ * look for the first valid record in it.
1478+ * It solves the problem of occasional invalid LSN on write-busy system.
1479+ * 2. Failing that, look for record in previous segment with endpoint
1480+ * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1481+ * on write-idle system. If that fails too, error out.
1482+ */
1483+
1484+ /* stop_lsn is pointing to a 0 byte of xlog segment */
1485+ if (stop_lsn % instance_config .xlog_seg_size == 0 )
1486+ {
1487+ /* Wait for segment with current stop_lsn, it is ok for it to never arrive */
1488+ wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1489+ false, true, WARNING , backup -> stream );
1490+
1491+ /* Get the first record in segment with current stop_lsn */
1492+ lsn_tmp = get_first_record_lsn (xlog_path , segno , backup -> tli ,
1493+ instance_config .xlog_seg_size ,
1494+ instance_config .archive_timeout );
1495+
1496+ /* Check that returned LSN is valid and greater than stop_lsn */
1497+ if (XLogRecPtrIsInvalid (lsn_tmp ) ||
1498+ !XRecOffIsValid (lsn_tmp ) ||
1499+ lsn_tmp < stop_lsn )
1500+ {
1501+ /* Backup from master should error out here */
1502+ if (!backup -> from_replica )
1503+ elog (ERROR , "Failed to get next WAL record after %X/%X" ,
1504+ (uint32 ) (stop_lsn >> 32 ),
1505+ (uint32 ) (stop_lsn ));
1506+
1507+ /* No luck, falling back to looking up for previous record */
1508+ elog (WARNING , "Failed to get next WAL record after %X/%X, "
1509+ "looking for previous WAL record" ,
1510+ (uint32 ) (stop_lsn >> 32 ),
1511+ (uint32 ) (stop_lsn ));
1512+
1513+ /* Despite looking for previous record there is not guarantee of success
1514+ * because previous record can be the contrecord.
1515+ */
1516+ lsn_tmp = wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1517+ true, false, ERROR , backup -> stream );
1518+
1519+ /* sanity */
1520+ if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1521+ elog (ERROR , "Failed to get WAL record prior to %X/%X" ,
1522+ (uint32 ) (stop_lsn >> 32 ),
1523+ (uint32 ) (stop_lsn ));
1524+ }
1525+ }
1526+ /* stop lsn is aligned to xlog block size, just find next lsn */
1527+ else if (stop_lsn % XLOG_BLCKSZ == 0 )
1528+ {
1529+ /* Wait for segment with current stop_lsn */
1530+ wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1531+ false, true, ERROR , backup -> stream );
1532+
1533+ /* Get the next closest record in segment with current stop_lsn */
1534+ lsn_tmp = get_next_record_lsn (xlog_path , segno , backup -> tli ,
1535+ instance_config .xlog_seg_size ,
1536+ instance_config .archive_timeout ,
1537+ stop_lsn );
1538+
1539+ /* sanity */
1540+ if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1541+ elog (ERROR , "Failed to get WAL record next to %X/%X" ,
1542+ (uint32 ) (stop_lsn >> 32 ),
1543+ (uint32 ) (stop_lsn ));
1544+ }
1545+ /* PostgreSQL returned something very illegal as STOP_LSN, error out */
1546+ else
1547+ elog (ERROR , "Invalid stop_backup_lsn value %X/%X" ,
1548+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1549+
1550+ /* Setting stop_backup_lsn will set stop point for streaming */
1551+ stop_backup_lsn = lsn_tmp ;
1552+ stop_lsn_exists = true;
1553+ }
1554+
1555+ elog (LOG , "stop_lsn: %X/%X" ,
1556+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1557+
1558+ /*
1559+ * Wait for stop_lsn to be archived or streamed.
1560+ * If replica returned valid STOP_LSN of not actually existing record,
1561+ * look for previous record with endpoint >= STOP_LSN.
1562+ */
1563+ if (!stop_lsn_exists )
1564+ stop_backup_lsn = wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1565+ false, false, ERROR , backup -> stream );
1566+
1567+ backup -> stop_lsn = stop_backup_lsn ;
1568+ }
1569+
14341570/* Remove annoying NOTICE messages generated by backend */
14351571void
14361572pg_silent_client_messages (PGconn * conn )
@@ -1729,7 +1865,6 @@ static void
17291865pg_stop_backup (InstanceState * instanceState , pgBackup * backup , PGconn * pg_startbackup_conn ,
17301866 PGNodeInfo * nodeInfo )
17311867{
1732- bool stop_lsn_exists = false;
17331868 PGStopBackupResult stop_backup_result ;
17341869 char * xlog_path , stream_xlog_path [MAXPGPATH ];
17351870 /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */
@@ -1772,121 +1907,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
17721907 else
17731908 xlog_path = instanceState -> instance_wal_subdir_path ;
17741909
1775- /* It is ok for replica to return invalid STOP LSN
1776- * UPD: Apparently it is ok even for a master.
1777- */
1778- if (!XRecOffIsValid (stop_backup_result .lsn ))
1779- {
1780- XLogSegNo segno = 0 ;
1781- XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
1782-
1783- /*
1784- * Even though the value is invalid, it's expected postgres behaviour
1785- * and we're trying to fix it below.
1786- */
1787- elog (LOG , "Invalid offset in stop_lsn value %X/%X, trying to fix" ,
1788- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1789-
1790- /*
1791- * Note: even with gdb it is very hard to produce automated tests for
1792- * contrecord + invalid LSN, so emulate it for manual testing.
1793- */
1794- //stop_backup_result.lsn = stop_backup_result.lsn - XLOG_SEG_SIZE;
1795- //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1796- // (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn));
1797-
1798- GetXLogSegNo (stop_backup_result .lsn , segno , instance_config .xlog_seg_size );
1799-
1800- /*
1801- * Note, that there is no guarantee that corresponding WAL file even exists.
1802- * Replica may return LSN from future and keep staying in present.
1803- * Or it can return invalid LSN.
1804- *
1805- * That's bad, since we want to get real LSN to save it in backup label file
1806- * and to use it in WAL validation.
1807- *
1808- * So we try to do the following:
1809- * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1810- * look for the first valid record in it.
1811- * It solves the problem of occasional invalid LSN on write-busy system.
1812- * 2. Failing that, look for record in previous segment with endpoint
1813- * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1814- * on write-idle system. If that fails too, error out.
1815- */
1816-
1817- /* stop_lsn is pointing to a 0 byte of xlog segment */
1818- if (stop_backup_result .lsn % instance_config .xlog_seg_size == 0 )
1819- {
1820- /* Wait for segment with current stop_lsn, it is ok for it to never arrive */
1821- wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1822- false, true, WARNING , backup -> stream );
1823-
1824- /* Get the first record in segment with current stop_lsn */
1825- lsn_tmp = get_first_record_lsn (xlog_path , segno , backup -> tli ,
1826- instance_config .xlog_seg_size ,
1827- instance_config .archive_timeout );
1828-
1829- /* Check that returned LSN is valid and greater than stop_lsn */
1830- if (XLogRecPtrIsInvalid (lsn_tmp ) ||
1831- !XRecOffIsValid (lsn_tmp ) ||
1832- lsn_tmp < stop_backup_result .lsn )
1833- {
1834- /* Backup from master should error out here */
1835- if (!backup -> from_replica )
1836- elog (ERROR , "Failed to get next WAL record after %X/%X" ,
1837- (uint32 ) (stop_backup_result .lsn >> 32 ),
1838- (uint32 ) (stop_backup_result .lsn ));
1839-
1840- /* No luck, falling back to looking up for previous record */
1841- elog (WARNING , "Failed to get next WAL record after %X/%X, "
1842- "looking for previous WAL record" ,
1843- (uint32 ) (stop_backup_result .lsn >> 32 ),
1844- (uint32 ) (stop_backup_result .lsn ));
1845-
1846- /* Despite looking for previous record there is not guarantee of success
1847- * because previous record can be the contrecord.
1848- */
1849- lsn_tmp = wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1850- true, false, ERROR , backup -> stream );
1851-
1852- /* sanity */
1853- if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1854- elog (ERROR , "Failed to get WAL record prior to %X/%X" ,
1855- (uint32 ) (stop_backup_result .lsn >> 32 ),
1856- (uint32 ) (stop_backup_result .lsn ));
1857- }
1858- }
1859- /* stop lsn is aligned to xlog block size, just find next lsn */
1860- else if (stop_backup_result .lsn % XLOG_BLCKSZ == 0 )
1861- {
1862- /* Wait for segment with current stop_lsn */
1863- wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1864- false, true, ERROR , backup -> stream );
1865-
1866- /* Get the next closest record in segment with current stop_lsn */
1867- lsn_tmp = get_next_record_lsn (xlog_path , segno , backup -> tli ,
1868- instance_config .xlog_seg_size ,
1869- instance_config .archive_timeout ,
1870- stop_backup_result .lsn );
1871-
1872- /* sanity */
1873- if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1874- elog (ERROR , "Failed to get WAL record next to %X/%X" ,
1875- (uint32 ) (stop_backup_result .lsn >> 32 ),
1876- (uint32 ) (stop_backup_result .lsn ));
1877- }
1878- /* PostgreSQL returned something very illegal as STOP_LSN, error out */
1879- else
1880- elog (ERROR , "Invalid stop_backup_lsn value %X/%X" ,
1881- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1882-
1883- /* Setting stop_backup_lsn will set stop point for streaming */
1884- stop_backup_lsn = lsn_tmp ;
1885- stop_lsn_exists = true;
1886- }
1887-
1888- elog (LOG , "stop_lsn: %X/%X" ,
1889- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1910+ wait_wal_and_calculate_stop_lsn (xlog_path , stop_backup_result .lsn , backup );
18901911
18911912 /* Write backup_label and tablespace_map */
18921913 if (!exclusive_backup )
@@ -1917,15 +1938,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
19171938 }
19181939 }
19191940
1920- /*
1921- * Wait for stop_lsn to be archived or streamed.
1922- * If replica returned valid STOP_LSN of not actually existing record,
1923- * look for previous record with endpoint >= STOP_LSN.
1924- */
1925- if (!stop_lsn_exists )
1926- stop_backup_lsn = wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1927- false, false, ERROR , backup -> stream );
1928-
19291941 if (backup -> stream )
19301942 {
19311943 /* This function will also add list of xlog files
@@ -1934,7 +1946,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
19341946 elog (ERROR , "WAL streaming failed" );
19351947 }
19361948
1937- backup -> stop_lsn = stop_backup_lsn ;
19381949 backup -> recovery_xid = stop_backup_result .snapshot_xid ;
19391950
19401951 elog (LOG , "Getting the Recovery Time from WAL" );
0 commit comments