Skip to content

Commit

Permalink
Avoid redundand memory allocation and sycnhronization in walredo (#144)
Browse files Browse the repository at this point in the history
* Avoid redundand memory allocation and sycnhronization in walredo

* Address review comments

* Reduce number of temp buffers and size of inmem file storage for wal redo postgres

* Misc cleanup

Add comments on 'inmem_smgr.c', remove superfluous copy-pasted comments,
pgindent.

Co-authored-by: Heikki Linnakangas <heikki.linnakangas@iki.fi>
  • Loading branch information
2 people authored and tristan957 committed Aug 9, 2023
1 parent f6a66d8 commit 0715d9f
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 164 deletions.
188 changes: 74 additions & 114 deletions contrib/zenith/inmem_smgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,60 @@
*
* inmem_smgr.c
*
* This is an implementation of the SMGR interface, used in the WAL redo
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
* storage, the pages that are written out are kept in a small number of
* in-memory buffers.
*
* Normally, replaying a WAL record only needs to access a handful of
* buffers, which fit in the normal buffer cache, so this is just for
* "overflow" storage when the buffer cache is not large enough.
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* contrib/zenith/inmem_smgr.c
*
* TODO cleanup obsolete copy-pasted comments
*-------------------------------------------------------------------------
*/
#include "postgres.h"

#include "pagestore_client.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include "storage/relfilenode.h"
#include "pagestore_client.h"
#include "utils/hsearch.h"
#include "access/xlog.h"
#include "storage/smgr.h"

typedef struct
{
RelFileNode node;
ForkNumber forknum;
BlockNumber blkno;
} WrNodeKey;
#define MAX_PAGES 128

typedef struct
{
WrNodeKey tag;
char data[BLCKSZ];
} WrNode;
static BufferTag page_tag[MAX_PAGES];
static char page_body[MAX_PAGES][BLCKSZ];
static int used_pages;

HTAB *inmem_files;
static int
locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
{
/* We only hold a small number of pages, so linear search */
for (int i = 0; i < used_pages; i++)
{
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
&& forknum == page_tag[i].forkNum
&& blkno == page_tag[i].blockNum)
{
return i;
}
}
return -1;
}

/*
* inmem_init() -- Initialize private state
*/
void
inmem_init(void)
{
HASHCTL hashCtl;

hashCtl.keysize = sizeof(WrNodeKey);
hashCtl.entrysize = sizeof(WrNode);

if (inmem_files)
hash_destroy(inmem_files);

inmem_files = hash_create("wal-redo files map",
1024,
&hashCtl,
HASH_ELEM | HASH_BLOBS);
used_pages = 0;
}

/*
Expand All @@ -59,15 +64,15 @@ inmem_init(void)
bool
inmem_exists(SMgrRelation reln, ForkNumber forknum)
{
WrNodeKey key;

key.node = reln->smgr_rnode.node;
key.forknum = forknum;
key.blkno = 0;
return hash_search(inmem_files,
&key,
HASH_FIND,
NULL) != NULL;
for (int i = 0; i < used_pages; i++)
{
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
&& forknum == page_tag[i].forkNum)
{
return true;
}
}
return false;
}

/*
Expand All @@ -82,21 +87,6 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)

/*
* inmem_unlink() -- Unlink a relation.
*
* Note that we're passed a RelFileNodeBackend --- by the time this is called,
* there won't be an SMgrRelation hashtable entry anymore.
*
* forknum can be a fork number to delete a specific fork, or InvalidForkNumber
* to delete all forks.
*
*
* If isRedo is true, it's unsurprising for the relation to be already gone.
* Also, we should remove the file immediately instead of queuing a request
* for later, since during redo there's no possibility of creating a
* conflicting relation.
*
* Note: any failure should be reported as WARNING not ERROR, because
* we are usually not in a transaction anymore when this is called.
*/
void
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
Expand All @@ -116,17 +106,8 @@ void
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer, bool skipFsync)
{
WrNodeKey key;
WrNode *node;

key.node = reln->smgr_rnode.node;
key.forknum = forknum;
key.blkno = blkno;
node = hash_search(inmem_files,
&key,
HASH_ENTER,
NULL);
memcpy(node->data, buffer, BLCKSZ);
/* same as smgwrite() for us */
inmem_write(reln, forknum, blkno, buffer, skipFsync);
}

/*
Expand Down Expand Up @@ -156,9 +137,6 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

/*
* inmem_writeback() -- Tell the kernel to write pages back to storage.
*
* This accepts a range of blocks because flushing several pages at once is
* considerably more efficient than doing so individually.
*/
void
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
Expand All @@ -173,20 +151,13 @@ void
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer)
{
WrNodeKey key;
WrNode *node;
int pg;

key.node = reln->smgr_rnode.node;
key.forknum = forknum;
key.blkno = blkno;
node = hash_search(inmem_files,
&key,
HASH_FIND,
NULL);
if (node != NULL)
memcpy(buffer, node->data, BLCKSZ);
else
pg = locate_page(reln, forknum, blkno);
if (pg < 0)
memset(buffer, 0, BLCKSZ);
else
memcpy(buffer, page_body[pg], BLCKSZ);
}

/*
Expand All @@ -200,17 +171,19 @@ void
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
WrNodeKey key;
WrNode *node;
int pg;

pg = locate_page(reln, forknum, blocknum);
if (pg < 0)
{
if (used_pages == MAX_PAGES)
elog(ERROR, "Inmem storage overflow");

key.node = reln->smgr_rnode.node;
key.forknum = forknum;
key.blkno = blocknum;
node = hash_search(inmem_files,
&key,
HASH_ENTER,
NULL);
memcpy(node->data, buffer, BLCKSZ);
pg = used_pages;
used_pages++;
INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
}
memcpy(page_body[pg], buffer, BLCKSZ);
}

/*
Expand All @@ -219,23 +192,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
BlockNumber
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
{
WrNodeKey key;
WrNode *node;

key.node = reln->smgr_rnode.node;
key.forknum = forknum;
key.blkno = 0;
int nblocks = 0;

while (true)
for (int i = 0; i < used_pages; i++)
{
node = hash_search(inmem_files,
&key,
HASH_FIND,
NULL);
if (node == NULL)
return key.blkno;
key.blkno += 1;
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
&& forknum == page_tag[i].forkNum)
{
if (page_tag[i].blockNum >= nblocks)
nblocks = page_tag[i].blockNum + 1;
}
}
return nblocks;
}

/*
Expand All @@ -248,19 +216,12 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)

/*
* inmem_immedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
* nothing of dirty buffers that may exist inside the buffer manager. We
* sync active and inactive segments; smgrDoPendingSyncs() relies on this.
* Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
* some segment, then mdtruncate() renders that segment inactive. If we
* crash before the next checkpoint syncs the newly-inactive segment, that
* segment may survive recovery, reintroducing unwanted data into the table.
*/
void
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
{
}

static const struct f_smgr inmem_smgr =
{
.smgr_init = inmem_init,
Expand All @@ -283,12 +244,11 @@ static const struct f_smgr inmem_smgr =
const f_smgr *
smgr_inmem(BackendId backend, RelFileNode rnode)
{
if (backend != InvalidBackendId && !InRecovery)
Assert(InRecovery);
if (backend != InvalidBackendId)
return smgr_standard(backend, rnode);
else
{
return &inmem_smgr;
}
}

void
Expand Down
7 changes: 5 additions & 2 deletions src/backend/storage/buffer/bufmgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,6 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
mode, strategy, &hit);
}


/*
* ReadBuffer_common -- common logic for all ReadBuffer variants
*
Expand All @@ -819,7 +818,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
Block bufBlock;
bool found;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);
/*
* wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer,
* so let's use local buffers instead
*/
bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres;

*hit = false;

Expand Down
Loading

0 comments on commit 0715d9f

Please sign in to comment.