除了源码,了解WAL最好的方式是通过lt_waldump入手:
[lightdb@lightdb1 bin]$ ./lt_waldump --help
lt_waldump decodes and displays LightDB write-ahead logs for debugging.
Usage:
lt_waldump [OPTION]... [STARTSEG [ENDSEG]]
Options:
-b, --bkp-details output detailed information about backup blocks
-e, --end=RECPTR stop reading at WAL location RECPTR
-f, --follow keep retrying after reaching end of WAL
-n, --limit=N number of records to display
-p, --path=PATH directory in which to find log segment files or a
directory with a ./pg_wal that contains such files
(default: current directory, ./pg_wal, $PGDATA/pg_wal)
-q, --quiet do not print any output, except for errors
-r, --rmgr=RMGR only show records generated by resource manager RMGR;
use --rmgr=list to list valid resource manager names
-s, --start=RECPTR start reading at WAL location RECPTR
-t, --timeline=TLI timeline from which to read log records
(default: 1 or the value used in STARTSEG)
-V, --version output version information, then exit
-x, --xid=XID only show records with transaction ID XID
-z, --stats[=record] show statistics instead of records
(optionally, show per-record statistics)
-?, --help show this help, then exit
Report bugs to <https://github.com/hslightdb>.
LightDB home page: <https://www.hs.net/lightdb>
[lightdb@lightdb1 bin]$ ./lt_waldump --rmgr=list
XLOG
Transaction
Storage
CLOG
Database
Tablespace
MultiXact
RelMap
Standby
Heap2
Heap
Btree
Hash
Gin
Gist
Sequence
SPGist
BRIN
CommitTs
ReplicationOrigin
Generic
LogicalMessage
对应的资源管理器定义结构如下:
/*
* List of resource manager entries. Note that order of entries defines the
* numerical values of each rmgr's ID, which is stored in WAL records. New
* entries should be added at the end, to avoid changing IDs of existing
* entries.
*
* Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
*/
/* symbol name, textual name, redo, desc, identify, startup, cleanup */
PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
对应的,每种access method通常都有对应的xlog(它主要对应block data/main data的物理组织结构)结构 ,如下:
具体到heapam中,新增、修改、删除、multi insert等等都是不同的结构定义,具体可见heapam_xlog.h。
先看pg_waldump的解析日志:
insert一条带主键的记录
rmgr: XLOG len (rec/tot): 49/ 8169, tx: 0, lsn: 0/4CBB9D78, prev 0/4CBB9D40, desc: FPI_FOR_HINT , blkref #0: rel 1663/13580/2619 blk 16 FPW
rmgr: Heap2 len (rec/tot): 56/ 56, tx: 0, lsn: 0/4CBBBD80, prev 0/4CBB9D78, desc: CLEAN remxid 538, blkref #0: rel 1663/13580/2619 blk 16
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/4CBBBDB8, prev 0/4CBBBD80, desc: RUNNING_XACTS nextXid 592 latestCompletedXid 591 oldestRunningXid 592
rmgr: Heap len (rec/tot): 54/ 3510, tx: 592, lsn: 0/4CBBBDF0, prev 0/4CBBBDB8, desc: INSERT off 78 flags 0x00, blkref #0: rel 1663/13580/32899 blk 5405 FPW
rmgr: Btree len (rec/tot): 53/ 5641, tx: 592, lsn: 0/4CBBCBC0, prev 0/4CBBBDF0, desc: INSERT_LEAF off 8, blkref #0【本wal记录内块号】: rel 1663【表空间】/13580【数据库】/41089【对象】 blk 848【文件内块号】 FPW
rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/4CBBE1E8, prev 0/4CBBCBC0, desc: RUNNING_XACTS nextXid 593 latestCompletedXid 591 oldestRunningXid 592; 1 xacts: 592
pg_waldump: fatal: error in WAL record at 0/4CBBE1E8: invalid record length at 0/4CBBE220: wanted 24, got 0rec<tot基本上是因为发生了FPW,如果开启了FPW,则压缩是非常有价值的。
rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/4CBBE220, prev 0/4CBBE1E8, desc: RUNNING_XACTS nextXid 593 latestCompletedXid 591 oldestRunningXid 592; 1 xacts: 592
rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/4CBBE258, prev 0/4CBBE220, desc: CHECKPOINT_ONLINE redo 0/4CBBE220; tli 1; prev tli 1; fpw true; xid 0:593; oid 57492; multi 1; offset 0; oldest xid 478 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 592; online
rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/4CBBE2D0, prev 0/4CBBE258, desc: RUNNING_XACTS nextXid 593 latestCompletedXid 591 oldestRunningXid 592; 1 xacts: 592
rmgr: Transaction len (rec/tot): 34/ 34, tx: 592, lsn: 0/4CBBE308, prev 0/4CBBE2D0, desc: COMMIT 2021-07-17 15:44:51.835849 CST
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/4CBBE330, prev 0/4CBBE308, desc: RUNNING_XACTS nextXid 593 latestCompletedXid 592 oldestRunningXid 593
blkref 说明:分配给一个wal记录的缓冲默认情况下为5个块,由下列常量定义。
/*
* The minimum size of the WAL construction working area. If you need to
* register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more
* than XLR_NORMAL_RDATAS data chunks in a single WAL record, you must call
* XLogEnsureRecordSpace() first to allocate more working memory.
*/
#define XLR_NORMAL_MAX_BLOCK_ID 4
#define XLR_NORMAL_RDATAS 20
正常都是从block id=0开始,如下:
所以可以看到除非text/bytea等大字段类型,一般情况下blkref都为0。
=====
wal记录可能包含两部分,FPI和非FPI,通过DecodedBkpBlock.has_img区分某个wal record的各个块是FPI还是非FPI。
typedef struct
{
/* Is this block ref in use? */
bool in_use;
/* Identify the block this refers to */
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
/* copy of the fork_flags field from the XLogRecordBlockHeader */
uint8 flags;
/* Information on full-page image, if any */
bool has_image; /* has image, even for consistency checking */
bool apply_image; /* has image that should be restored */
char *bkp_image;
uint16 hole_offset;
uint16 hole_length;
uint16 bimg_len;
uint8 bimg_info;
/* Buffer holding the rmgr-specific data associated with this block */
bool has_data;
char *data;
uint16 data_len;
uint16 data_bufsz;
} DecodedBkpBlock;
struct XLogReaderState
{
/*
* Operational callbacks
*/
XLogReaderRoutine routine;
/* ----------------------------------------
* Public parameters
* ----------------------------------------
*/
/*
* System identifier of the xlog files we're about to read. Set to zero
* (the default value) if unknown or unimportant.
*/
uint64 system_identifier;
/*
* Opaque data for callbacks to use. Not used by XLogReader.
*/
void *private_data;
/*
* Start and end point of last record read. EndRecPtr is also used as the
* position to read next. Calling XLogBeginRead() sets EndRecPtr to the
* starting position and ReadRecPtr to invalid.
*/
XLogRecPtr ReadRecPtr; /* start of last record read */
XLogRecPtr EndRecPtr; /* end+1 of last record read */
/* ----------------------------------------
* Decoded representation of current record
*
* Use XLogRecGet* functions to investigate the record; these fields
* should not be accessed directly.
* ----------------------------------------
*/
XLogRecord *decoded_record; /* currently decoded record */
char *main_data; /* record's main data portion */
uint32 main_data_len; /* main data portion's length */
uint32 main_data_bufsz; /* allocated size of the buffer */
RepOriginId record_origin;
/* information about blocks referenced by the record. */
DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1];
int max_block_id; /* highest block_id in use (-1 if none) */
/* ----------------------------------------
* private/internal state
* ----------------------------------------
*/
/*
* Buffer for currently read page (XLOG_BLCKSZ bytes, valid up to at least
* readLen bytes)
*/
char *readBuf;
uint32 readLen;
/* last read XLOG position for data currently in readBuf */
WALSegmentContext segcxt;
WALOpenSegment seg;
uint32 segoff;
/*
* beginning of prior page read, and its TLI. Doesn't necessarily
* correspond to what's in readBuf; used for timeline sanity checks.
*/
XLogRecPtr latestPagePtr;
TimeLineID latestPageTLI;
/* beginning of the WAL record being read. */
XLogRecPtr currRecPtr;
/* timeline to read it from, 0 if a lookup is required */
TimeLineID currTLI;
/*
* Safe point to read to in currTLI if current TLI is historical
* (tliSwitchPoint) or InvalidXLogRecPtr if on current timeline.
*
* Actually set to the start of the segment containing the timeline switch
* that ends currTLI's validity, not the LSN of the switch its self, since
* we can't assume the old segment will be present.
*/
XLogRecPtr currTLIValidUntil;
/*
* If currTLI is not the most recent known timeline, the next timeline to
* read from when currTLIValidUntil is reached.
*/
TimeLineID nextTLI;
/*
* Buffer for current ReadRecord result (expandable), used when a record
* crosses a page boundary.
*/
char *readRecordBuf;
uint32 readRecordBufSize;
/* Buffer to hold error message */
char *errormsg_buf;
};
wal记录固定长度头部的定义
/*
* The overall layout of an XLOG record is:
* Fixed-size header (XLogRecord struct)
* XLogRecordBlockHeader struct
* XLogRecordBlockHeader struct
* ...
* XLogRecordDataHeader[Short|Long] struct
* block data
* block data
* ...
* main data
*
* There can be zero or more XLogRecordBlockHeaders, and 0 or more bytes of
* rmgr-specific data not associated with a block. XLogRecord structs
* always start on MAXALIGN boundaries in the WAL files, but the rest of
* the fields are not aligned.
*
* The XLogRecordBlockHeader, XLogRecordDataHeaderShort and
* XLogRecordDataHeaderLong structs all begin with a single 'id' byte. It's
* used to distinguish between block references, and the main data structs.
*/
typedef struct XLogRecord
{
uint32 xl_tot_len; /* total len of entire record */
TransactionId xl_xid; /* xact id */
XLogRecPtr xl_prev; /* ptr to previous record in log */
uint8 xl_info; /* flag bits, see below */ 分为前4字节(rmgr保存各个rmgr特有的一些标记信息,如一致性检查,典型的例子是xlog_identify(uint8 info))和后4字节
RmgrId xl_rmid; /* resource manager for this record */
/* 2 bytes of padding here, initialize to zero */
pg_crc32c xl_crc; /* CRC for this record */
/* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */
} XLogRecord;
理解XLOG的这个布局是非常重要的,因为waldump解析出来的每条记录和语句不是一一对应的,而是和修改的物理对象一一对应(否则做不到物理apply)。
waldump读的逻辑基本时还是依赖xlogreader.c完成,waldump只是做了具体的实现,和sql执行过程一样,所有的状态由XLogReaderState整个周期内维护。
/*
* pg_switch_wal: switch to next xlog file
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_switch_wal(PG_FUNCTION_ARGS)
{
XLogRecPtr switchpoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
switchpoint = RequestXLogSwitch(false); // 具体的实现
/*
* As a convenience, return the WAL location of the switch record
*/
PG_RETURN_LSN(switchpoint);
}
注:pg_switch_wal是一个内置函数。在pg_proc.dat中注册(genbki.pl据此生成postgres.bki,gen_fmgrtab.pl生成fmgroids.h, fmgrprotos.h, and fmgrtab.c)。
{ oid => '2848', descr => 'switch to new wal file',
proname => 'pg_switch_wal', provolatile => 'v', prorettype => 'pg_lsn',
proargtypes => '', prosrc => 'pg_switch_wal' },
select * from pg_proc where proname='pg_switch_wal';
Name |Value |
---------------+-------------------+
oid |2848 |
proname |pg_switch_wal |
pronamespace |11 |
proowner |10 |
prolang |12 |
procost |1.0 |
prorows |0.0 |
provariadic |0 |
prosupport |- |
prokind |f |
prosecdef |false |
proleakproof |false |
proisstrict |true |
proretset |false |
provolatile |v |
proparallel |s |
pronargs |0 |
pronargdefaults|0 |
prorettype |3220 |
proargtypes |{} |
proallargtypes |NULL |
proargmodes |NULL |
proargnames |NULL |
proargdefaults | |
protrftypes |NULL |
prosrc |pg_switch_wal |
probin | |
proconfig |NULL |
proacl |{lightdb=X/lightdb}|
下面来分析wal文件的生成与切换。
pg_switch_wal moves to the next write-ahead log file, allowing the current file to be archived (assuming you are using continuous archiving). The return value is the ending write-ahead log location + 1 within the just-completed write-ahead log file. If there has been no write-ahead log activity since the last write-ahead log switch, pg_switch_wal does nothing and returns the start location of the write-ahead log file currently in use.
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
0/4CBBE468
(1 row)
[zjh@hs-10-20-30-193 data]$ ll pg_wal/
total 2097192
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000000000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000002
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
0/80000078
(1 row)
[zjh@hs-10-20-30-193 data]$ ll pg_wal/
total 3145768
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000000000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000002
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000003
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
0/C0000078
(1 row)
[zjh@hs-10-20-30-193 data]$ ll pg_wal/
total 3145768
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000000000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000002
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000003
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/0 ??这个值什么时候会出现?
(1 row)
[zjh@hs-10-20-30-193 data]$ ll pg_wal/
total 4194344
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000000000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000002
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000003
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000100000000
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/78 ??这个值什么时候会出现?
(1 row)
[zjh@hs-10-20-30-193 data]$ ll pg_wal/
total 4194344
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000000000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000002
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000000000003
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000100000000
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/40000000
(1 row)
postgres=# select pg_current_wal_lsn();
pg_current_wal_lsn
--------------------
1/40000148
(1 row)
postgres=# select pg_current_wal_lsn();
pg_current_wal_lsn
--------------------
1/40000148
(1 row)
postgres=# insert into t_sample select* from t_sample limit 1000000;
rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 1/40000098, prev 1/40000060, desc: CHECKPOINT_ONLINE redo 1/40000060; tli 1; prev tli 1; fpw true; xid 0:593; oid 57492; m
ulti 1; offset 0; oldest xid 478 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 593; online
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 1/40000110, prev 1/40000098, desc: RUNNING_XACTS nextXid 593 latestCompletedXid 592 oldestRunningXid 593
=== wal_level >= WAL_LEVEL_REPLICA时,会写Standby。
=== 下一行开始不停的生成
rmgr: Heap len (rec/tot): 54/ 3554, tx: 593, lsn: 1/40000148, prev 1/40000110, desc: INSERT off 79 flags 0x00, blkref #0: rel 1663/13580/32899 blk 5405 FPW # full page write
rmgr: Btree len (rec/tot): 53/ 5661, tx: 593, lsn: 1/40000F30, prev 1/40000148, desc: INSERT_LEAF off 9, blkref #0: rel 1663/13580/41089 blk 848 FPW
rmgr: Heap len (rec/tot): 64/ 64, tx: 593, lsn: 1/40002568, prev 1/40000F30, desc: INSERT off 80 flags 0x00, blkref #0: rel 1663/13580/32899 blk 5405
rmgr: Btree len (rec/tot): 64/ 64, tx: 593, lsn: 1/400025A8, prev 1/40002568, desc: INSERT_LEAF off 10, blkref #0: rel 1663/13580/41089 blk 848
rmgr: Heap len (rec/tot): 64/ 64, tx: 593, lsn: 1/400025E8, prev 1/400025A8, desc: INSERT off 81 flags 0x00, blkref #0: rel 1663/13580/32899 blk 5405
rmgr: Btree len (rec/tot): 64/ 64, tx: 593, lsn: 1/40002628, prev 1/400025E8, desc: INSERT_LEAF off 11, blkref #0: rel 1663/13580/41089 blk 848
正常情况下len都比较小,即使插入8k长度记录也一样(会被TOAST+压缩,所以很小),要模拟大记录,要设置列存储方式为plain。此时就可以看到大的rec记录。
rmgr: Heap len (rec/tot): 8159/ 8159, tx: 2215040, lsn: E/A0128F10, prev E/A0126F18, desc: INSERT+INIT off 1 flags 0x00, blkref #0: rel 1663/15154/434247 blk 28
rmgr: Heap len (rec/tot): 8159/ 8159, tx: 2215040, lsn: E/A012AF08, prev E/A0128F10, desc: INSERT+INIT off 1 flags 0x00, blkref #0: rel 1663/15154/434247 blk 29
rmgr: Heap len (rec/tot): 8159/ 8159, tx: 2215040, lsn: E/A012CF00, prev E/A012AF08, desc: INSERT+INIT off 1 flags 0x00, blkref #0: rel 1663/15154/434247 blk 30
rmgr: Heap len (rec/tot): 8159/ 8159, tx: 2215040, lsn: E/A012EEF8, prev E/A012CF00, desc: INSERT+INIT off 1 flags 0x00, blkref #0: rel 1663/15154/434247 blk 31
rmgr: Transaction len (rec/tot): 34/ 34, tx: 2215040, lsn: E/A0130EF0, prev E/A012EEF8, desc: COMMIT 2021-11-07 19:14:27.878160 CST
timeline https://postgreshelp.com/postgresql-timelines/切换,跟oracle的resetlogs一样,在不完全恢复后增加1
=== rmgr,所有有物理记录的都是资源,定义在rmgrlist.h接口中。src/include/access/rmgr.h,src/bin/pg_waldump/rmgrdesc.h
typedef struct RmgrDescData
{
const char *rm_name;
void (*rm_desc) (StringInfo buf, XLogReaderState *record);
const char *(*rm_identify) (uint8 info);
} RmgrDescData;
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
{ name, desc, identify},
const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
#include "access/rmgrlist.h" # 居然还有这种用法????预处理器果然NB
};
src/bin/pg_waldump/pg_waldump.c是pg_waldump入口
desc定义在src/backend/access/rmgrdesc/xxxdesc.c中,每个资源管理器一个。
postgres=# select pg_current_wal_lsn();
pg_current_wal_lsn
--------------------
1/47BB62D0 -- 7BB62D0(10进制等于129721040,约123M)
(1 row)
postgres=# insert into t_sample select* from t_sample limit 1000000;
INSERT 0 1000000
postgres=# select pg_current_wal_lsn();
pg_current_wal_lsn
--------------------
1/521B5A00 -- A5FF730(10进制等于174061360,约168M)
(1 row)
注:LSN和十进制LSN的对应关系如下:
[zjh@hs-10-20-30-193 data]$ ll pg_wal/ --- 但是WAL文件多生成了1个
total 5242920
-rw------- 1 zjh zjh 1073741824 Jul 17 16:23 000000010000000100000001
-rw------- 1 zjh zjh 1073741824 Jul 17 16:00 000000010000000100000002
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000100000003
-rw------- 1 zjh zjh 1073741824 Jul 17 16:01 000000010000000200000000
-rw------- 1 zjh zjh 1073741824 Jul 17 16:02 000000010000000200000001
drwx------ 2 zjh zjh 6 Jun 14 19:05 archive_status
-rw-rw-r-- 1 zjh zjh 37691 Jul 15 16:06 gmon.out
postgres=# select pg_current_wal_lsn();
pg_current_wal_lsn
--------------------
1/521B5A00
(1 row)
postgres=# select pg_walfile_name('1/521B5A70'); -- 这就说明虽然执行了switch file,但是WAL还是往000000010000000100000001在写??这不就有问题么?
pg_walfile_name
--------------------------
000000010000000100000001
(1 row)
postgres=# select pg_walfile_name('1/521B5A00');
pg_walfile_name
--------------------------
000000010000000100000001
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/521B5B38
(1 row)
postgres=# select pg_switch_wal();
pg_switch_wal
---------------
1/80000078
(1 row)
postgres=# select pg_walfile_name('1/80000078');
pg_walfile_name
--------------------------
000000010000000100000002
(1 row)
## 即使再切换,也没有生成新的WAL文件,是因为没有开启归档,同时已经满足了min_wal_size的要求
max_wal_size = 64GB
min_wal_size = 5GB
============现在开始分析wal文件的规则=================
postgres=# select pg_current_wal_flush_lsn(),pg_current_wal_insert_lsn(),pg_current_wal_lsn(),pg_walfile_name_offset(pg_current_wal_lsn());
pg_current_wal_flush_lsn | pg_current_wal_insert_lsn | pg_current_wal_lsn | pg_walfile
_name_offset
--------------------------+---------------------------+--------------------+-----------------
-------------------
E/601CDF60 | E/601CDF60 | E/601CDF60 | (000000030000000
E00000003,1892192)
在pg的wal中,有个三个位置,write位置、insert位置以及flush位置。三者的定义和差异为:
- insert位置是指从逻辑角度而言,已经被占用的位置,也就是已经分配给wal writer或进程
- write位置是已经调用了fwrite,但是还没有调用fflush,但是服务器外部可能可见
- flush是指已经调用了fflush,只有fflush之后的wal,才是真正完成的。
↓flush的位置 ↓write的位置 ↓insert的位置
00000000000000001000000000000000000100000000000000000000000100000000000000000000000000000000000000000000000000000000000000
从切换可知,wal文件名一定是递增的。
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ 13.3-21.2]$ cd data/defaultCluster/pg_wal/
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ll
total 2097180
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:35 000000030000000D00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:25 000000030000000D00000001
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
drwx------ 2 lightdb lightdb 4096 Nov 7 11:26 archive_status
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:35 000000030000000D00000000
drwx------ 2 lightdb lightdb 4096 Nov 7 11:26 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:25 000000030000000D00000001
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:35 000000030000000D00000000
drwx------ 2 lightdb lightdb 4096 Nov 7 11:26 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:25 000000030000000D00000001
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000001
drwx------ 2 lightdb lightdb 4096 Nov 7 11:36 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000001
drwx------ 2 lightdb lightdb 4096 Nov 7 11:36 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000001
drwx------ 2 lightdb lightdb 4096 Nov 7 11:36 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000D00000002
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000002
drwx------ 2 lightdb lightdb 4096 Nov 7 11:37 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000001
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:26 000000030000000C00000007
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000003
drwx------ 2 lightdb lightdb 4096 Nov 7 11:37 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000006
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000005
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:36 000000030000000D00000004
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000D00000004
drwx------ 2 lightdb lightdb 4096 Nov 7 11:38 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000D00000003
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000006
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000005
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000D00000005
drwx------ 2 lightdb lightdb 4096 Nov 7 11:38 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000D00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000E00000000
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:37 000000030000000D00000006
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:39 000000030000000D00000006
drwx------ 2 lightdb lightdb 4096 Nov 7 11:39 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:39 000000030000000D00000005
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000D00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000E00000000
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:40 000000030000000D00000007
drwx------ 2 lightdb lightdb 4096 Nov 7 11:40 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:40 000000030000000E00000002
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:39 000000030000000E00000001
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000E00000000
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 12:30 000000030000000D00000007
drwx------ 2 lightdb lightdb 4096 Nov 7 11:40 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:40 000000030000000E00000002
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:39 000000030000000E00000001
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:38 000000030000000E00000000
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ ls -lt
total 2097180
-rw------- 1 lightdb lightdb 536870912 Nov 7 12:30 000000030000000E00000000
drwx------ 2 lightdb lightdb 4096 Nov 7 12:30 archive_status
-rw------- 1 lightdb lightdb 536870912 Nov 7 12:30 000000030000000D00000007
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:40 000000030000000E00000002
-rw------- 1 lightdb lightdb 536870912 Nov 7 11:39 000000030000000E00000001
-rw------- 1 lightdb lightdb 85 Nov 4 23:17 00000003.history
-rw------- 1 lightdb lightdb 42 Nov 4 23:05 00000002.history
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ lt_waldump -s E/61045FB0 000000030000000E00000003
rmgr: Transaction len (rec/tot): 34/ 34, tx: 2075705, lsn: E/61052460, prev E/610523C0, desc: COMMIT 2021-11-07 14:42:26.209643 CST
rmgr: Heap len (rec/tot): 155/ 155, tx: 2075706, lsn: E/61052488, prev E/61052460, desc: HOT_UPDATE off 18 xmax 2075706 flags 0x60 ; new off 19 xmax 0, blkref #0: rel 1663/20678/16575 blk 13
rmgr: Transaction len (rec/tot): 34/ 34, tx: 2075706, lsn: E/61052528, prev E/61052488, desc: COMMIT 2021-11-07 14:42:26.210615 CST
rmgr: Heap len (rec/tot): 78/ 78, tx: 2075707, lsn: E/61052550, prev E/61052528, desc: HOT_UPDATE off 24 xmax 2075707 flags 0x60 ; new off 25 xmax 0, blkref #0: rel 1663/15154/17153 blk 19
rmgr: Heap len (rec/tot): 155/ 155, tx: 2075707, lsn: E/610525A0, prev E/61052550, desc: HOT_UPDATE off 25 xmax 2075707 flags 0x60 ; new off 26 xmax 0, blkref #0: rel 1663/15154/17153 blk 19
rmgr: Transaction len (rec/tot): 34/ 34, tx: 2075707, lsn: E/61052640, prev E/610525A0, desc: COMMIT 2021-11-07 14:42:26.211391 CST
rmgr: XLOG len (rec/tot): 24/ 24, tx: 0, lsn: E/61052668, prev E/61052640, desc: SWITCH --执行了切换命令
## 不指定-s也可以用来查看某个wal文件的起始lsn、事务号,反过来可以看到上个文件的最后lsn和事务号
[lightdb@iZvv70fyhxdyvjgzmr7ikpZ pg_wal]$ lt_waldump -n 5 000000030000000E00000004
rmgr: Heap len (rec/tot): 111/ 111, tx: 2075708, lsn: E/80000028【第一条记录刚好从XLogLongPageHeaderData之后开始】, prev E/61052668, desc: INSERT off 75 flags 0x00, blkref #0: rel 1663/17542/17577 blk 1200
rmgr: Heap len (rec/tot): 163/ 163, tx: 2075710, lsn: E/80000098, prev E/80000028, desc: HOT_UPDATE off 19 xmax 2075710 flags 0x60 ; new off 20 xmax 0, blkref #0: rel 1663/20678/16575 blk 13
rmgr: Heap len (rec/tot): 163/ 163, tx: 2075711, lsn: E/80000140, prev E/80000098, desc: HOT_UPDATE off 32 xmax 2075711 flags 0x60 ; new off 33 xmax 0, blkref #0: rel 1663/17542/16575 blk 21
rmgr: Transaction len (rec/tot): 34/ 34, tx: 2075710, lsn: E/800001E8, prev E/80000140, desc: COMMIT 2021-11-07 14:42:35.151189 CST
rmgr: Heap len (rec/tot): 163/ 163, tx: 2075709, lsn: E/80000210, prev E/800001E8, desc: HOT_UPDATE off 50 xmax 2075709 flags 0x60 ; new off 51 xmax 0, blkref #0: rel 1663/17614/16575 blk 19
postgres=# SELECT * FROM pg_walfile_name_offset('E/80000028');
file_name | file_offset
--------------------------+-------------
000000030000000E00000004 | 40
(1 row)
如果开启了归档的话,文件在满足min_wal_size后被删除之前一定会被归档。
回到切换的源码实现RequestXLogSwitch。
XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)
{
XLogRecPtr RecPtr;
/* XLOG SWITCH has no data */
XLogBeginInsert();
if (mark_unimportant)
XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
return RecPtr;
}
XLogRecPtr
XLogInsert(RmgrId rmid, uint8 info)
{
XLogRecPtr EndPos;
/* XLogBeginInsert() must have been called. */
if (!begininsert_called)
elog(ERROR, "XLogBeginInsert was not called");
/*
* The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
* XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
*/
if ((info & ~(XLR_RMGR_INFO_MASK |
XLR_SPECIAL_REL_UPDATE |
XLR_CHECK_CONSISTENCY)) != 0)
elog(PANIC, "invalid xlog info mask %02X", info);
TRACE_POSTGRESQL_WAL_INSERT(rmid, info);
/*
* In bootstrap mode, we don't actually log anything but XLOG resources;
* return a phony record pointer.
*/
if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
{
XLogResetInsertion();
EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */
return EndPos;
}
do
{
XLogRecPtr RedoRecPtr;
bool doPageWrites;
XLogRecPtr fpw_lsn;
XLogRecData *rdt;
int num_fpi = 0;
/*
* Get values needed to decide whether to do full-page writes. Since
* we don't yet have an insertion lock, these could change under us,
* but XLogInsertRecord will recheck them once it has a lock.
*/
GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites);
rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites,
&fpw_lsn, &num_fpi);
EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi); ## 统一实现写wal记录和切换的函数
} while (EndPos == InvalidXLogRecPtr);
XLogResetInsertion();
return EndPos;
}
checkpoint_timeout=1800(默认300)checkpoint完全应该自动,根据恢复时间要求来自动调整,跟oracle一样(因为checkpoint以来写了多少WAL,启动恢复相应数量的WAL需要多久也知道,所以就能算出来)。超过max_wal_size之后也会执行强制checkpoint(因为wal被重用)
pg_start_backup和pg_basebackup,CREATE DATABASE / DROP DATABASE执行的时候,停止pg的时候也会执行checkpoint。
事务隔离性使得写wal可以随便并发。
wal_log_hints会导致checkpoint后,页如果发生了即使不重要的数据改动,也遵循全页写机制。如果wal_checksum启用了,这个参数会自动强制启用。
full_page_writes为了避免部分写(8k被内核拆分为4k,被磁盘拆分为512字节) https://www.2ndquadrant.com/en/blog/on-the-impact-of-full-page-writes/,所以只要恢复一部分数据块即可,还不用先读取data file。这样的话checkpoint的频率以及shared buffer的大小(增量检查点很重要)就非常重要了。
checksum在每次写到OS以及读的时候计算,不是每次修改都计算,所以那负载就好多了(offload cpu checksum),所以OLTP开启吧https://postgreshelp.com/postgresql-checksum/,DSS和批处理不要开启。
pg_rewind(PostgreSQL 9.5官方包含,用于主从故障切换),一般推荐使用repmgr rejoin机制。
WAL的写入
一直以为wal日志记录的写入是由wal writer负责的,debug下来发现主要是backend进程写的。如下:
2021-07-17 22:39:08.196670T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] LOG: 00000: INSERT @ 1/C18B8330: - Heap/INSERT: off 57 flags 0x00
2021-07-17 22:39:08.196670T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] LOCATION: XLogInsertRecord, xlog.c:1245
2021-07-17 22:39:08.196670T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] STATEMENT: insert into t_sample select* from t_sample limit 100000;
2021-07-17 22:39:08.196692T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] LOG: 00000: INSERT @ 1/C18B8370: - Btree/INSERT_LEAF: off 259
2021-07-17 22:39:08.196692T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] LOCATION: XLogInsertRecord, xlog.c:1245
2021-07-17 22:39:08.196692T psql zjh@postgres localhost(58634) client backend INSERT 00000 [2021-07-17 22:39:01 CST] 598 [59686] STATEMENT: insert into t_sample select* from t_sample limit 100000;
2021-07-17 23:59:15.533598T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] DEBUG: 00000: CommitTransaction(1) name: unnamed; blockState
: STARTED; state: INPROGRESS, xid/subid/cid: 599/1/0 (used)
2021-07-17 23:59:15.533598T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] LOCATION: ShowTransactionStateRec, xact.c:5351
2021-07-17 23:59:15.533598T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] STATEMENT: insert into t_sample select* from t_sample limit
10;
2021-07-17 23:59:15.533642T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] LOG: 00000: INSERT @ 1/C18BA970: - Transaction/COMMIT: 2021
-07-17 23:59:15.533614+08
2021-07-17 23:59:15.533642T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] LOCATION: XLogInsertRecord, xlog.c:1245
2021-07-17 23:59:15.533642T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] STATEMENT: insert into t_sample select* from t_sample limit
10;
2021-07-17 23:59:15.533658T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] LOG: 00000: xlog flush request 1/C18BA970; write 1/C18B8538;
flush 1/C18B8538
2021-07-17 23:59:15.533658T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] LOCATION: XLogFlush, xlog.c:2873
2021-07-17 23:59:15.533658T psql zjh@postgres localhost(48414) client backend INSERT 00000 [2021-07-17 23:59:11 CST] 599 [131103] STATEMENT: insert into t_sample select* from t_sample limit
10;
/*
* If this is the single and first tuple on page, we can reinit the
* page instead of restoring the whole thing. Set flag, and hide
* buffer references from XLogInsert.
*/
if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
{
info |= XLOG_HEAP_INIT_PAGE;
bufflags |= REGBUF_WILL_INIT;
}
xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
xlrec.flags = 0;
if (all_visible_cleared)
xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
if (options & HEAP_INSERT_SPECULATIVE)
xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
/*
* For logical decoding, we need the tuple even if we're doing a full
* page write, so make sure it's included even if we take a full-page
* image. (XXX We could alternatively store a pointer into the FPW).
*/
if (RelationIsLogicallyLogged(relation) &&
!(options & HEAP_INSERT_NO_LOGICAL))
{
xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
bufflags |= REGBUF_KEEP_DATA;
}
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
xlhdr.t_infomask = heaptup->t_data->t_infomask;
xlhdr.t_hoff = heaptup->t_data->t_hoff;
/*
* note we mark xlhdr as belonging to buffer; if XLogInsert decides to
* write the whole page to the xlog, we don't need to store
* xl_heap_header in the xlog.
*/
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
XLogRegisterBufData(0,
(char *) heaptup->t_data + SizeofHeapTupleHeader,
heaptup->t_len - SizeofHeapTupleHeader);
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info);
PageSetLSN(page, recptr);
所有写入逻辑通过xloginsert.c中的XLogInsert函数调用XLogInsertRecord实现,这样的话wal offset的位置如何分配就非常重要了(毕竟会有连接死掉,超过max_wal_size时还有事务未提交等情况)。
至于wal writer。异步模式下,wal writer负责定时(wal_writer_delay控制)负责调用XLogBackgroundFlush(void)
2021-07-17 22:39:29.119288T @ walwriter 00000 [2021-07-15 09:26:52 CST] 0 [133565] LOG: 00000: xlog bg flush request write 1/C18B8450; flush: 1/C18B8450, current is write 1/C18B8418; flush 1/C18B8418
2021-07-17 22:39:29.119288T @ walwriter 00000 [2021-07-15 09:26:52 CST] 0 [133565] LOCATION: XLogBackgroundFlush, xlog.c:3125
2021-07-17 22:39:29.119559T @ walwriter 00000 [2021-07-15 09:26:52 CST] 0 [133565] DEBUG: 00000: initialized 92 pages, up to 1/C1CB8000
2021-07-17 22:39:29.119559T @ walwriter 00000 [2021-07-15 09:26:52 CST] 0 [133565] LOCATION: AdvanceXLInsertBuffer, xlog.c:2283
- AdvanceXLInsertBuffer
- XLogInsert
- XLogBackgroundFlush
- logical_read_xlog_page
- main
- StartChildProcess
WAL解析
要正确解析wal,需要使用一个大小为最大理论上支持的并发数的wal entry数组做解析缓存。维护一个全局当前已经解析的lsn个apply的lsn。除了get外,解析和应用也可以完全并行。
XLOG/WAL的主要状态
/*
* Total shared-memory state for XLOG.
*/
typedef struct XLogCtlData
{
XLogCtlInsert Insert;
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
FullTransactionId ckptFullXid; /* nextFullXid of latest checkpoint */
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
XLogRecPtr unloggedLSN;
slock_t ulsn_lck;
/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
pg_time_t lastSegSwitchTime;
XLogRecPtr lastSegSwitchLSN;
/*
* Protected by info_lck and WALWriteLock (you must hold either lock to
* read it, but both to update)
*/
XLogwrtResult LogwrtResult;
/*
* Latest initialized page in the cache (last byte position + 1).
*
* To change the identity of a buffer (and InitializedUpTo), you need to
* hold WALBufMappingLock. To change the identity of a buffer that's
* still dirty, the old page needs to be written out first, and for that
* you need WALWriteLock, and you need to ensure that there are no
* in-progress insertions to the page by calling
* WaitXLogInsertionsToFinish().
*/
XLogRecPtr InitializedUpTo;
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. xlblocks values are protected by
* WALBufMappingLock.
*/
char *pages; /* buffers for unwritten XLOG pages */
XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
int XLogCacheBlck; /* highest allocated xlog buffer index */
/*
* Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
* If we created a new timeline when the system was started up,
* PrevTimeLineID is the old timeline's ID that we forked off from.
* Otherwise it's equal to ThisTimeLineID.
*/
TimeLineID ThisTimeLineID;
TimeLineID PrevTimeLineID;
/*
* SharedRecoveryState indicates if we're still in crash or archive
* recovery. Protected by info_lck.
*/
RecoveryState SharedRecoveryState;
/*
* SharedHotStandbyActive indicates if we allow hot standby queries to be
* run. Protected by info_lck.
*/
bool SharedHotStandbyActive;
/*
* SharedPromoteIsTriggered indicates if a standby promotion has been
* triggered. Protected by info_lck.
*/
bool SharedPromoteIsTriggered;
/*
* WalWriterSleeping indicates whether the WAL writer is currently in
* low-power mode (and hence should be nudged if an async commit occurs).
* Protected by info_lck.
*/
bool WalWriterSleeping;
/*
* recoveryWakeupLatch is used to wake up the startup process to continue
* WAL replay, if it is waiting for WAL to arrive or failover trigger file
* to appear.
*/
Latch recoveryWakeupLatch;
/*
* During recovery, we keep a copy of the latest checkpoint record here.
* lastCheckPointRecPtr points to start of checkpoint record and
* lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
* checkpointer when it wants to create a restartpoint.
*
* Protected by info_lck.
*/
XLogRecPtr lastCheckPointRecPtr;
XLogRecPtr lastCheckPointEndPtr;
CheckPoint lastCheckPoint;
/*
* lastReplayedEndRecPtr points to end+1 of the last record successfully
* replayed. When we're currently replaying a record, ie. in a redo
* function, replayEndRecPtr points to the end+1 of the record being
* replayed, otherwise it's equal to lastReplayedEndRecPtr.
*/
XLogRecPtr lastReplayedEndRecPtr;
TimeLineID lastReplayedTLI;
XLogRecPtr replayEndRecPtr;
TimeLineID replayEndTLI;
/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
TimestampTz recoveryLastXTime;
/*
* timestamp of when we started replaying the current chunk of WAL data,
* only relevant for replication or archive recovery
*/
TimestampTz currentChunkStartTime;
/* Are we requested to pause recovery? */
bool recoveryPause;
/*
* lastFpwDisableRecPtr points to the start of the last replayed
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
*/
XLogRecPtr lastFpwDisableRecPtr;
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
标签:rw,lightdb,-------,wal,pg,Nov From: https://blog.51cto.com/zhjh256/5927788