From 600316040c26f82c0ff91570c025a5e2536ec688 Mon Sep 17 00:00:00 2001 From: jxdking Date: Sat, 10 Apr 2021 01:04:09 +0000 Subject: [PATCH] Add 2 Module Parameters Regarding Log Size Limit zfs_wrlog_data_max The upper limit of TX_WRITE log data. Once it is reached, write operations are blocked until log data is cleared out after txg sync. It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. This defaults to the same value as zfs_dirty_data_max. zfs_wrlog_data_sync_percent The least TX_WRITE log data (as a percentage of zfs_wrlog_data_max) to kick a txg sync. Signed-off-by: jxdking --- include/sys/dsl_pool.h | 8 +++++ module/zfs/arc.c | 4 +++ module/zfs/dsl_pool.c | 68 ++++++++++++++++++++++++++++++++++++++++++ module/zfs/zfs_log.c | 7 +++++ module/zfs/zvol.c | 8 +++-- 5 files changed, 93 insertions(+), 2 deletions(-) diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 8249bb8fc633..1fd02fc09957 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -58,6 +58,8 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; +extern int zfs_wrlog_data_sync_percent; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -119,6 +121,11 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + /* Uses dp_wrlog_lock */ + kmutex_t dp_wrlog_lock; + uint64_t dp_wrlog_pertxg[TXG_SIZE]; + uint64_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. 
@@ -158,6 +165,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); +void dsl_pool_wrlog_delay(int64_t resid, dsl_pool_t *dp, uint64_t txg); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f0ae3938a333..fef1e19e4935 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7706,6 +7706,10 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + zfs_wrlog_data_max = zfs_dirty_data_max; + } } void diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index c770eafa75d8..4a0b9d56e2ef 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -104,6 +104,20 @@ unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +/* + * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. + * Once it is reached, write operations are blocked + * until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. + * This defaults to the same value as zfs_dirty_data_max. + * + * zfs_wrlog_data_sync_percent, the least TX_WRITE log data + * (as a percentage of zfs_wrlog_data_max) to push a txg. + */ +unsigned long zfs_wrlog_data_max = 0; +int zfs_wrlog_data_sync_percent = 30; + + /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. 
This should be less than @@ -218,6 +232,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dp->dp_wrlog_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, @@ -415,6 +430,7 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); + mutex_destroy(&dp->dp_wrlog_lock); cv_destroy(&dp->dp_spaceavail_cv); taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); @@ -593,6 +609,49 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_delay(int64_t size, dsl_pool_t *dp, uint64_t txg) +{ + uint64_t last_total; + + mutex_enter(&dp->dp_wrlog_lock); + last_total = dp->dp_wrlog_total; + dp->dp_wrlog_total += size; + dp->dp_wrlog_pertxg[txg & TXG_MASK] += size; + mutex_exit(&dp->dp_wrlog_lock); + + if (last_total > zfs_wrlog_data_max) { + dprintf("write log total exceeds zfs_wrlog_data_max. " + "dp_wrlog_total: %llu", last_total); + + /* + * The current txg needs to stay open to process the current + * write transaction. We can only wait until txg - 1 + * is synced. 
+ */ + if (txg > 2) { + txg_wait_synced(dp, txg - 1); + } + } + + if (last_total >= + zfs_wrlog_data_sync_percent * zfs_wrlog_data_max / 100) { + txg_kick(dp); + } +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + mutex_enter(&dp->dp_wrlog_lock); + dp->dp_wrlog_total -= dp->dp_wrlog_pertxg[txg & TXG_MASK]; + dp->dp_wrlog_pertxg[txg & TXG_MASK] = 0; + mutex_exit(&dp->dp_wrlog_lock); + + dprintf("write log total cleared for txg: %llu, " + "dp_wrlog_total: %llu", txg, dp->dp_wrlog_total); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -817,6 +876,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -1393,6 +1455,12 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, "Determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The write log limit"); + +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_sync_percent, INT, ZMOD_RW, + "The data txg sync threshold as a percentage of zfs_wrlog_data_max"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 30d5c4821ae5..f3a75f681422 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -45,6 +45,8 @@ #include #include #include +#include + /* * These zfs_log_* functions must be called within a dmu tx, in one @@ -541,6 +543,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; uintptr_t fsync_cnt; uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -626,6 +629,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_delay(size, zilog->zl_dmu_pool, tx->tx_txg); + } } /* diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index b6609363f047..7cd2c5b692af 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -84,9 +84,8 @@ #include #include #include - #include - +#include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; @@ -579,6 +578,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -630,6 +630,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_delay(sz, zilog->zl_dmu_pool, tx->tx_txg); + } } /*