From ad3ee4a5e15c813277c5cb9b69e4a17f009118b8 Mon Sep 17 00:00:00 2001
From: Mariusz Zaborski
Date: Tue, 7 Dec 2021 11:49:01 +0100
Subject: [PATCH] Introduce write throttle smoothing

To avoid stalls and uneven performance during heavy write workloads,
continue to apply write throttling even when the amount of dirty data
has temporarily dipped below zfs_delay_min_dirty_percent, for up to
zfs_smoothing_write seconds.

Signed-off-by: Allan Jude
Signed-off-by: Mariusz Zaborski
---
 include/sys/dsl_pool.h |  3 +++
 man/man4/zfs.4         | 17 ++++++++++++++++-
 module/zfs/dmu_tx.c    | 35 +++++++++++++++++++++++++----------
 module/zfs/dsl_pool.c  | 11 ++++++++++-
 4 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 44900f8ceb2f..4080017ee067 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -65,6 +65,8 @@ extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
 extern int zfs_delay_min_dirty_percent;
 extern unsigned long zfs_delay_scale;
+extern unsigned long zfs_smoothing_scale;
+extern unsigned long zfs_smoothing_write;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define	DMU_OT_DEFERRED	DMU_OT_NONE
@@ -116,6 +118,7 @@ typedef struct dsl_pool {
 	kcondvar_t dp_spaceavail_cv;
 	uint64_t dp_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_dirty_total;
+	hrtime_t dp_last_smooth;
 	uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_mos_used_delta;
 	uint64_t dp_mos_compressed_delta;
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 9ca4451a2791..9dc649e5e278 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -15,7 +15,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd June 1, 2021
+.Dd December 7, 2021
 .Dt ZFS 4
 .Os
 .
@@ -940,6 +940,21 @@ This will smoothly handle between ten times and a tenth of this number.
 .Pp
 .Sy zfs_delay_scale * zfs_dirty_data_max Em must be smaller than Sy 2^64 .
 .
+.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong
+Controls for how many seconds write smoothing is applied.
+The smoothing mechanism continues to add transaction delays
+after the amount of dirty data drops below
+.Sy zfs_delay_min_dirty_percent .
+This mechanism may be used to avoid stalls and uneven performance during
+heavy write workloads.
+.
+.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq ulong
+Similar to
+.Sy zfs_delay_scale ,
+but for write smoothing.
+This variable controls the scale of the smoothing curve.
+Larger values cause longer delays for a given amount of dirty data.
+.
 .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disables requirement for IVset GUIDs to be present and match when doing a raw
 receive of encrypted datasets.
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 5fa516866668..db0696118793 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -778,14 +778,16 @@ int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
  * of zfs_delay_scale to increase the steepness of the curve.
  */
 static void
-dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
-	hrtime_t wakeup, min_tx_time, now;
+	hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time;
 
-	if (dirty <= delay_min_bytes)
+	now = gethrtime();
+
+	if (dirty <= delay_min_bytes && last_smooth <= now)
 		return;
 
 	/*
@@ -796,11 +798,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 	 */
 	ASSERT3U(dirty, <, zfs_dirty_data_max);
 
-	now = gethrtime();
-	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
-	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
-	if (now > tx->tx_start + min_tx_time)
+	smoothing_time = 0;
+	delay_time = 0;
+
+	if (dirty > delay_min_bytes) {
+		delay_time = zfs_delay_scale *
+		    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	}
+	if (last_smooth > now) {
+		smoothing_time = zfs_smoothing_scale * dirty /
+		    (zfs_dirty_data_max - dirty);
+	}
+
+	min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns);
+	if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time)
 		return;
 
 	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
@@ -810,6 +821,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 	wakeup = MAX(tx->tx_start + min_tx_time,
 	    dp->dp_last_wakeup + min_tx_time);
 	dp->dp_last_wakeup = wakeup;
+	if (dirty > delay_min_bytes) {
+		dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC;
+	}
 	mutex_exit(&dp->dp_lock);
 
 	zfs_sleep_until(wakeup);
@@ -1071,7 +1085,7 @@ dmu_tx_wait(dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
 	dsl_pool_t *dp = tx->tx_pool;
-	hrtime_t before;
+	hrtime_t before, last_smooth;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(!dsl_pool_config_held(tx->tx_pool));
@@ -1091,10 +1105,11 @@ dmu_tx_wait(dmu_tx_t *tx)
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
 		while (dp->dp_dirty_total >= zfs_dirty_data_max)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		last_smooth = dp->dp_last_smooth;
 		dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
 
-		dmu_tx_delay(tx, dirty);
+		dmu_tx_delay(tx, dirty, last_smooth);
 
 		tx->tx_wait_dirty = B_FALSE;
 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 5d1522a7bfd0..6ee8ba4d5d0c 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0;
 unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
+unsigned long zfs_smoothing_write = 0;
 
 /*
  * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
@@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60;
  * multiply in dmu_tx_delay().
 */
 unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+unsigned long zfs_smoothing_scale = 100000;
 
 /*
  * This determines the number of threads used by the dp_sync_taskq.
@@ -958,9 +960,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 
 	mutex_enter(&dp->dp_lock);
 	uint64_t dirty = dp->dp_dirty_total;
+	hrtime_t last_delay = dp->dp_last_smooth;
 	mutex_exit(&dp->dp_lock);
 
-	return (dirty > delay_min_bytes);
+	return (dirty > delay_min_bytes || last_delay > gethrtime());
 }
 
 static boolean_t
@@ -1465,6 +1468,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
 	"The size limit of write-transaction zil log data");
 
+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW,
+	"How long to continue write smoothing after the last delay (seconds)");
+
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
 	"zfs_dirty_data_max upper bound in bytes");
@@ -1475,6 +1481,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
 	"How quickly delay approaches infinity");
 
+ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW,
+	"Delay smoothing scale");
+
 ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
 	"Max percent of CPUs that are used to sync dirty data");
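
For reference, below is a minimal userspace sketch (not part of the patch) that
mirrors the min_tx_time calculation this change introduces in dmu_tx_delay().
The defaults for zfs_delay_min_dirty_percent, zfs_delay_scale, and
zfs_smoothing_scale are taken from the patched sources; zfs_dirty_data_max,
zfs_delay_max_ns, and the sample inputs are illustrative assumptions only.

/*
 * Illustrative sketch of the combined throttle/smoothing delay.
 * Not kernel code; constants marked "example" are made up for the demo.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t zfs_dirty_data_max = 4ULL << 30;           /* 4 GiB, example */
static uint64_t zfs_delay_min_dirty_percent = 60;          /* default */
static uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* default */
static uint64_t zfs_smoothing_scale = 100000;               /* default */
static uint64_t zfs_delay_max_ns = 100 * 1000 * 1000;       /* 100 ms, example */

#define MIN(a, b)	((a) < (b) ? (a) : (b))
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * Compute the per-transaction delay for a given amount of dirty data.
 * smoothing_active models "last_smooth > now", i.e. dirty data exceeded the
 * threshold within the last zfs_smoothing_write seconds.
 */
static uint64_t
tx_delay_ns(uint64_t dirty, int smoothing_active)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	uint64_t delay_time = 0, smoothing_time = 0;

	if (dirty <= delay_min_bytes && !smoothing_active)
		return (0);

	if (dirty > delay_min_bytes) {
		delay_time = zfs_delay_scale *
		    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	}
	if (smoothing_active) {
		smoothing_time = zfs_smoothing_scale * dirty /
		    (zfs_dirty_data_max - dirty);
	}
	return (MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns));
}

int
main(void)
{
	/* 50% dirty is below the 60% threshold: no delay without smoothing. */
	printf("50%% dirty, no smoothing: %llu ns\n",
	    (unsigned long long)tx_delay_ns(zfs_dirty_data_max / 2, 0));
	/* Within the smoothing window the same workload still sees a delay. */
	printf("50%% dirty, smoothing:    %llu ns\n",
	    (unsigned long long)tx_delay_ns(zfs_dirty_data_max / 2, 1));
	return (0);
}

With smoothing inactive, dirty data below the threshold incurs no delay at all;
while the smoothing window is open, the same amount of dirty data still incurs
a small, gradually scaled delay, which is the smoothing effect this patch aims
to provide.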