-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Avoid redundand memory allocation and sycnhronization in walredo #144
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,6 +99,9 @@ static ssize_t buffered_read(void *buf, size_t count); | |
|
||
static BufferTag target_redo_tag; | ||
|
||
bool am_wal_redo_postgres; | ||
static XLogReaderState reader_state; | ||
|
||
#define TRACE DEBUG5 | ||
|
||
#ifdef HAVE_LIBSECCOMP | ||
|
@@ -166,6 +169,7 @@ WalRedoMain(int argc, char *argv[], | |
InitStandaloneProcess(argv[0]); | ||
|
||
SetProcessingMode(InitProcessing); | ||
am_wal_redo_postgres = true; | ||
|
||
/* | ||
* Set default values for command-line options. | ||
|
@@ -293,6 +297,7 @@ WalRedoMain(int argc, char *argv[], | |
if (RmgrTable[rmid].rm_startup != NULL) | ||
RmgrTable[rmid].rm_startup(); | ||
} | ||
reader_state.errormsg_buf = MemoryContextAlloc(TopMemoryContext, 1000 + 1); /* MAX_ERRORMSG_LEN */ | ||
knizhnik marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#ifdef HAVE_LIBSECCOMP | ||
/* We prefer opt-out to opt-in for greater security */ | ||
|
@@ -313,24 +318,23 @@ WalRedoMain(int argc, char *argv[], | |
/* | ||
* Main processing loop | ||
*/ | ||
MemoryContextSwitchTo(MessageContext); | ||
initStringInfo(&input_message); | ||
|
||
Comment on lines
+329
to
+331
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes us vulnerable to memory leaks. I guess that's OK; redo functions should not leak memory, or that would be a problem with normal WAL recovery too. We just have to be careful to not leak memory in our code in this file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately there is no fast way to get memory context size, otherwise I can add assert that size of MessageContext is not exceeding some limit. |
||
for (;;) | ||
{ | ||
/* | ||
* Release storage left over from prior query cycle, and create a new | ||
* query input buffer in the cleared MessageContext. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "cleared MessageContext" isn't accurate anymore, as we don't reset in the loop anymore. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will fix this comment |
||
*/ | ||
MemoryContextSwitchTo(MessageContext); | ||
MemoryContextResetAndDeleteChildren(MessageContext); | ||
|
||
initStringInfo(&input_message); | ||
resetStringInfo(&input_message); | ||
|
||
set_ps_display("idle"); | ||
|
||
/* | ||
* (3) read a command (loop blocks here) | ||
*/ | ||
firstchar = ReadRedoCommand(&input_message); | ||
|
||
switch (firstchar) | ||
{ | ||
case 'B': /* BeginRedoForBlock */ | ||
|
@@ -406,23 +410,6 @@ pprint_buffer(char *data, int len) | |
return s.data; | ||
} | ||
|
||
static char * | ||
pprint_tag(BufferTag *tag) | ||
{ | ||
StringInfoData s; | ||
|
||
initStringInfo(&s); | ||
|
||
appendStringInfo(&s, "%u/%u/%u.%d blk %u", | ||
tag->rnode.spcNode, | ||
tag->rnode.dbNode, | ||
tag->rnode.relNode, | ||
tag->forkNum, | ||
tag->blockNum | ||
); | ||
|
||
return s.data; | ||
} | ||
/* ---------------------------------------------------------------- | ||
* routines to obtain user input | ||
* ---------------------------------------------------------------- | ||
|
@@ -492,7 +479,6 @@ ReadRedoCommand(StringInfo inBuf) | |
return qtype; | ||
} | ||
|
||
|
||
/* | ||
* Prepare for WAL replay on given block | ||
*/ | ||
|
@@ -502,7 +488,6 @@ BeginRedoForBlock(StringInfo input_message) | |
RelFileNode rnode; | ||
ForkNumber forknum; | ||
BlockNumber blknum; | ||
MemoryContext oldcxt; | ||
SMgrRelation reln; | ||
|
||
/* | ||
|
@@ -520,16 +505,14 @@ BeginRedoForBlock(StringInfo input_message) | |
rnode.relNode = pq_getmsgint(input_message, 4); | ||
blknum = pq_getmsgint(input_message, 4); | ||
|
||
oldcxt = MemoryContextSwitchTo(TopMemoryContext); | ||
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); | ||
|
||
{ | ||
char* buf = pprint_tag(&target_redo_tag); | ||
elog(TRACE, "BeginRedoForBlock %s", buf); | ||
pfree(buf); | ||
} | ||
|
||
MemoryContextSwitchTo(oldcxt); | ||
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", | ||
hlinnaka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
target_redo_tag.rnode.spcNode, | ||
target_redo_tag.rnode.dbNode, | ||
target_redo_tag.rnode.relNode, | ||
target_redo_tag.forkNum, | ||
target_redo_tag.blockNum); | ||
|
||
reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); | ||
if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || | ||
|
@@ -589,7 +572,6 @@ ApplyRecord(StringInfo input_message) | |
XLogRecPtr lsn; | ||
XLogRecord *record; | ||
int nleft; | ||
XLogReaderState reader_state; | ||
|
||
/* | ||
* message format: | ||
|
@@ -608,20 +590,15 @@ ApplyRecord(StringInfo input_message) | |
record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); | ||
|
||
/* FIXME: use XLogReaderAllocate() */ | ||
knizhnik marked this conversation as resolved.
Show resolved
Hide resolved
|
||
memset(&reader_state, 0, sizeof(XLogReaderState)); | ||
reader_state.ReadRecPtr = 0; /* no 'prev' record */ | ||
reader_state.EndRecPtr = lsn; /* this record */ | ||
XLogBeginRead(&reader_state, lsn); | ||
reader_state.decoded_record = record; | ||
reader_state.errormsg_buf = palloc(1000 + 1); /* MAX_ERRORMSG_LEN */ | ||
|
||
if (!DecodeXLogRecord(&reader_state, record, &errormsg)) | ||
elog(ERROR, "failed to decode WAL record: %s", errormsg); | ||
|
||
/* Ignore any other blocks than the ones the caller is interested in */ | ||
redo_read_buffer_filter = redo_block_filter; | ||
|
||
RmgrTable[record->xl_rmid].rm_redo(&reader_state); | ||
|
||
redo_read_buffer_filter = NULL; | ||
|
||
elog(TRACE, "applied WAL record with LSN %X/%X", | ||
|
@@ -701,7 +678,7 @@ GetPage(StringInfo input_message) | |
} while (tot_written < BLCKSZ); | ||
|
||
ReleaseBuffer(buf); | ||
DropDatabaseBuffers(rnode.dbNode); | ||
DropRelFileNodeAllLocalBuffers(rnode); | ||
smgrinit(); //reset inmem smgr state | ||
|
||
elog(TRACE, "Page sent back for block %u", blknum); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this make any difference? Every WAL redo call dirties a page, so 'inmem_files' will never be empty.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could do one of the following:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I think I get it now. These functions are only used for pages that are evicted from the buffer cache. That should pretty much never happen with a WAL redo function. Replaying a record won't affect more than a handful of pages. XLR_MAX_BLOCK_ID is 32, so not more than that, actually. Are inmem_write() and inmem_read() dead code in practice?
In any case, let's add comments about that!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And let's do this:
The default num_temp_buffers is 1024, which is much more than we need. And DropRelFileNodeAllLocalBuffers() scans through all buffers. In my quick profiling with 'perf', DropRelFileNodeAllLocalBuffers() was taking about 10% of the CPU time (of the WAL redo process), and after changing it to 64, it's not visible in the profile anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As far as I remember there are some cases when wal redo writes data directly to file, bypassing shared buffers. This is why this
inmem_file
is needed. Without it (without resetting files) some tests didn't pass.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually even 32 (XLR_MAX_BLOCK_ID) pages are not needed because during recovery we restore only target page and ignoring all other.
I will try to check once again what is the maximal number of pages dirtied during wal redo (when it can be larger than 1).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense: will do.