diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 71a9ac7ca7bf..ad3f1b0e47ca 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -124,8 +124,8 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; - kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; + kstat_named_t dmu_tx_wrlog_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 2cd83d1ffb1c..d2ddc1c0e92f 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -165,7 +165,7 @@ uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); -boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); +boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a2f14fb557b0..d1ca69f80309 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -70,7 +70,7 @@ to a log2 fraction of the target ARC size. dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . -.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq int +.It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq int Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. . @@ -109,6 +109,11 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether buffers present on special vdevs are eligible for caching +into L2ARC. +If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. +. .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large @@ -159,7 +164,7 @@ If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. A minimum of -.Sy 64MB +.Sy 64 MiB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is @@ -189,12 +194,12 @@ to enable caching/reading prefetches to/from L2ARC. .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . -.It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq ulong +.It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq ulong Cold L2ARC devices will have .Sy l2arc_write_max increased by this amount while they remain cold. . -.It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq ulong +.It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq ulong Max write bytes per interval. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int @@ -204,35 +209,35 @@ or attaching an L2ARC device (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). . 
-.It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +.It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq ulong Mininum size of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the persistent L2ARC. .Pp -For L2ARC devices less than 1GB, the amount of data +For L2ARC devices less than 1 GiB, the amount of data .Fn l2arc_evict evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . -.It Sy metaslab_aliquot Ns = Ns Sy 524288 Ns B Po 512kB Pc Pq ulong +.It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq ulong Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. -In normal operation, ZFS will try to write this amount of data -to a top-level vdev before moving on to the next one. +In normal operation, ZFS will try to write this amount of data to each disk +before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group biasing based on their vdevs' over- or under-utilization relative to the pool. . -.It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Ns B Po 16MB + 1B Pc Pq ulong +.It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq ulong Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . -.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Ns B Po 1MB Pc Pq int +.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). This must be less than -.Sy DMU_MAX_ACCESS Pq 64MB . +.Sy DMU_MAX_ACCESS Pq 64 MiB . This applies primarily to .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . . @@ -256,7 +261,7 @@ Prevent metaslabs from being unloaded. .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable use of the fragmentation metric in computing metaslab weights. . -.It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +.It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see .Em >100`000 @@ -265,7 +270,7 @@ iterations and becomes the performance limiting factor on high-performance storage. .Pp With the default setting of -.Sy 16MB , +.Sy 16 MiB , we typically see less than .Em 500 iterations, even with very fragmented @@ -274,7 +279,7 @@ pools. The maximum number of iterations possible is .Sy metaslab_df_max_search / 2^(ashift+1) . With the default setting of -.Sy 16MB +.Sy 16 MiB this is .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 or @@ -288,7 +293,7 @@ this tunable controls which segment is used. If set, we will use the largest free segment. If unset, we will use a segment of at least the requested size. . -.It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1h Pc Pq ulong +.It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq ulong When we unload a metaslab, we cache the size of the largest free chunk. We use that cached size to determine whether or not to load a metaslab for a given allocation. 
@@ -339,11 +344,11 @@ and the allocation can't actually be satisfied .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq int When a vdev is added, target this number of metaslabs per top-level vdev. . -.It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int +.It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq int Default limit for metaslab size. . .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong -Maximum ashift used when optimizing for logical -> physical sector size on new +Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong @@ -375,7 +380,7 @@ Note that both this many TXGs and .Sy metaslab_unload_delay_ms milliseconds must pass before unloading will occur. . -.It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10min Pc Pq int +.It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq int After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note, that both this many milliseconds and @@ -449,7 +454,14 @@ If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . -.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int +.It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint +Limits the number of on-disk error log entries that will be converted to the +new format when enabling the +.Sy head_errlog +feature. +The default is to convert all log entries. +. +.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq int During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, @@ -460,10 +472,10 @@ The default value here was chosen to align with which is a similar concept when doing regular reads (but there's no reason it has to be the same). . -.It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512B Pc Pq ulong +.It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq ulong Logical ashift for file-based devices. . -.It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512B Pc Pq ulong +.It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq ulong Physical ashift for file-based devices. . .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int @@ -472,13 +484,13 @@ prefetch the entire object (all leaf blocks). However, this is limited by .Sy dmu_prefetch_max . . -.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq ulong If prefetching is enabled, disable prefetching for reads larger than this size. . -.It Sy zfetch_max_distance Ns = Ns Sy 8388608 Ns B Po 8MB Pc Pq uint +.It Sy zfetch_max_distance Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq uint Max bytes to prefetch per stream. . -.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64MB Pc Pq uint +.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint @@ -493,7 +505,7 @@ linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. . 
-.It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER-1 Pq uint +.It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. .Pp @@ -501,7 +513,7 @@ The value of .Sy MAX_ORDER depends on kernel configuration. . -.It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5kB Pc Pq uint +.It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint This is the minimum allocation size that will use scatter (page-based) ABDs. Smaller allocations will use linear ABDs. . @@ -533,10 +545,10 @@ Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds .Sy zfs_arc_dnode_limit . . -.It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8kB Pc Pq int +.It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq int The ARC's buffer hash table is sized based on the assumption of an average block size of this value. -This works out to roughly 1MB of hash table per 1GB of physical memory +This works out to roughly 1 MiB of hash table per 1 GiB of physical memory with 8-byte pointers. For configurations with a known larger average block size, this value can be increased to reduce the memory footprint. @@ -547,9 +559,9 @@ When .Fn arc_get_data_impl waits for this percent of the requested amount of data to be evicted. For example, by default, for every -.Em 2kB +.Em 2 KiB that's evicted, -.Em 1kB +.Em 1 KiB of it may be "reused" by a new allocation. Since this is above .Sy 100 Ns % , @@ -590,10 +602,12 @@ Under Linux, half of system memory will be used as the limit. Under .Fx , the larger of -.Sy all_system_memory - 1GB No and Sy 5/8 * all_system_memory +.Sy all_system_memory No \- Sy 1 GiB +and +.Sy 5/8 No \(mu Sy all_system_memory will be used as the limit. This value must be at least -.Sy 67108864 Ns B Pq 64MB . +.Sy 67108864 Ns B Pq 64 MiB . .Pp This value can be changed dynamically, with some caveats. It cannot be set back to @@ -661,7 +675,9 @@ to evict the required number of metadata buffers. Min size of ARC in bytes. .No If set to Sy 0 , arc_c_min will default to consuming the larger of -.Sy 32MB No or Sy all_system_memory/32 . +.Sy 32 MiB +and +.Sy all_system_memory No / Sy 32 . . .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq int Minimum time prefetched blocks are locked in the ARC. @@ -700,7 +716,7 @@ If equivalent to a quarter of the user-wired memory limit under .Fx and to -.Sy 134217728 Ns B Pq 128MB +.Sy 134217728 Ns B Pq 128 MiB under Linux. . .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq int @@ -721,10 +737,10 @@ ARC target size .Pq Sy arc_c by thresholds determined by this parameter. Exceeding by -.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5 +.Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 starts ARC reclamation process. If that appears insufficient, exceeding by -.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5 +.Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 blocks new buffer allocation until the reclaim thread catches up. Started reclamation process continues till ARC size returns below the target size. @@ -778,10 +794,10 @@ Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. 
.Pp The default limit of -.Sy 10000 Pq in practice, Em 160MB No per allocation attempt with 4kB pages +.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to -less than 100ms per allocation attempt, -even with a small average compressed block size of ~8kB. +less than 100 ms per allocation attempt, +even with a small average compressed block size of ~8 KiB. .Pp The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. @@ -789,7 +805,7 @@ and only applies on Linux. .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq ulong The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of -.Sy 512kB No and Sy all_system_memory/64 . +.Sy 512 KiB No and Sy all_system_memory/64 . . .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int Disable pool import at module load by ignoring the cache file @@ -830,12 +846,12 @@ bytes of memory and if the obsolete space map object uses more than bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. . -.It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +.It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq ulong Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes .Pq see Sy zfs_condense_indirect_vdevs_enable . . -.It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq ulong +.It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq ulong Minimum size vdev mapping to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . . @@ -851,7 +867,7 @@ to the file clears the log. This setting does not influence debug prints due to .Sy zfs_flags . . -.It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4MB Pc Pq int +.It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int Maximum size of the internal ZFS debug log. . .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int @@ -871,7 +887,7 @@ is set, then the deadman behavior is invoked as described by .Sy zfs_deadman_failmode . By default, the deadman is enabled and set to .Sy wait -which results in "hung" I/Os only being logged. +which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp @@ -891,21 +907,21 @@ This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . -.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1min Pc Pq int +.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq int Check time in milliseconds. This defines the frequency at which we check for hung I/O requests and potentially invoke the .Sy zfs_deadman_failmode behavior. . -.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10min Pc Pq ulong +.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq ulong Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the pool sync completes. . 
-.It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5min Pc Pq ulong +.It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq ulong Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the operation remains "hung", @@ -933,7 +949,7 @@ by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number. .No See Sx ZFS TRANSACTION DELAY . .Pp -.Sy zfs_delay_scale * zfs_dirty_data_max Em must be smaller than Sy 2^64 . +.Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw @@ -955,10 +971,10 @@ will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int -Rate limit delay and deadman zevents (which report slow I/Os) to this many per +Rate limit delay and deadman zevents (which report slow I/O operations) to this many per second. . -.It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1GB Pc Pq ulong +.It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq ulong Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory, in bytes. . @@ -966,22 +982,22 @@ log spacemap in memory, in bytes. Part of overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap, in millionths. . -.It Sy zfs_unflushed_log_block_max Ns = Ns Sy 262144 Po 256k Pc Pq ulong +.It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq ulong Describes the maximum number of log spacemap blocks allowed for each pool. The default value means that the space in all the log spacemaps can add up to no more than -.Sy 262144 +.Sy 131072 blocks (which means -.Em 32GB +.Em 16 GiB of logical space before compression and ditto blocks, assuming that blocksize is -.Em 128kB ) . +.Em 128 KiB ) . .Pp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease -the number of I/Os for spacemap updates per TXG. +the number of I/O operations for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. @@ -1082,9 +1098,9 @@ This should be less than . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction zil log data size in bytes. -Once it is reached, write operation is blocked, until log data is cleared out -after transaction group sync. Because of some overhead, it should be set -at least 2 times the size of +Write operations are throttled when approaching the limit until log data is +cleared out after transaction group sync. +Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput. It also should be smaller than the size of the slog device if slog is present. @@ -1141,11 +1157,6 @@ Maximum number of blocks freed in a single TXG. 
.It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq ulong Maximum number of dedup blocks freed in a single TXG. . -.It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Pq ulong -If nonzer, override record size calculation for -.Nm zfs Cm send -estimates. -. .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq int Maximum asynchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . @@ -1256,7 +1267,7 @@ For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), the number of concurrently-active I/O operations is limited to .Sy zfs_*_min_active , unless the vdev is "idle". -When there are no interactive I/O operatinons active (synchronous or otherwise), +When there are no interactive I/O operations active (synchronous or otherwise), and .Sy zfs_vdev_nia_delay operations have completed since the last interactive operation, @@ -1383,7 +1394,7 @@ Similar to .Sy zfs_free_min_time_ms , but for cleanup of old indirection records for removed vdevs. . -.It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq long +.It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq long Largest data block to write to the ZIL. Larger blocks will be treated as if the dataset being written to had the .Sy logbias Ns = Ns Sy throughput @@ -1393,7 +1404,7 @@ property set. Pattern written to vdev free space by .Xr zpool-initialize 8 . . -.It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +.It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq ulong Size of writes used by .Xr zpool-initialize 8 . This option is used by the test suite. @@ -1422,7 +1433,7 @@ This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before -executing the synctask - +executing the synctask \(em .Fn spa_livelist_condense_sync . This option is used by the test suite to trigger race conditions. . @@ -1441,7 +1452,7 @@ This option is used by the test suite to trigger race conditions. The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. . -.It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100MB Pc Pq ulong +.It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq ulong The maximum memory limit that can be set for a ZFS channel program, specified in bytes. . @@ -1457,15 +1468,15 @@ feature uses to estimate incoming log blocks. .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq ulong Maximum number of rows allowed in the summary of the spacemap log. . -.It Sy zfs_max_recordsize Ns = Ns Sy 1048576 Po 1MB Pc Pq int +.It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq int We currently support block sizes from -.Em 512B No to Em 16MB . +.Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. -Therefore, we do not allow the recordsize to be set larger than this tunable. -Larger blocks can be created by changing it, +Therefore, we formerly forbade creating blocks larger than 1M. +Larger blocks could be created by changing it, and pools with larger blocks can always be imported and used, regardless of this setting. . @@ -1523,7 +1534,7 @@ into the special allocation class. 
Historical statistics for this many latest multihost updates will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . . -.It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq ulong +.It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq ulong Used to control the frequency of multihost writes which are performed when the .Sy multihost pool property is on. @@ -1531,7 +1542,7 @@ This is one of the factors used to determine the length of the activity check during import. .Pp The multihost write period is -.Sy zfs_multihost_interval / leaf-vdevs . +.Sy zfs_multihost_interval No / Sy leaf-vdevs . On average a multihost write will be issued for each leaf vdev every .Sy zfs_multihost_interval @@ -1548,7 +1559,7 @@ the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .Pp On import the activity check waits a minimum amount of time determined by -.Sy zfs_multihost_interval * zfs_multihost_import_intervals , +.Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals , or the same product computed on the host which last had the pool imported, whichever is greater. The activity check time may be further extended if the value of MMP @@ -1556,7 +1567,7 @@ delay found in the best uberblock indicates actual multihost updates happened at longer intervals than .Sy zfs_multihost_interval . A minimum of -.Em 100ms +.Em 100 ms is enforced. .Pp .Sy 0 No is equivalent to Sy 1 . @@ -1573,7 +1584,7 @@ its configuration may take action such as suspending the pool or offlining a device. .Pp Otherwise, the pool will be suspended if -.Sy zfs_multihost_fail_intervals * zfs_multihost_interval +.Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval milliseconds pass without a successful MMP write. This guarantees the activity test will see MMP writes if the pool is imported. .Sy 1 No is equivalent to Sy 2 ; @@ -1605,7 +1616,7 @@ When enabled forces ZFS to sync data when flags are used allowing holes in a file to be accurately reported. When disabled holes will not be reported in recently dirtied files. . -.It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50MB Pc Pq int +.It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int The number of bytes which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. @@ -1623,8 +1634,8 @@ After this threshold is crossed, additional frees will wait until the next TXG. . .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable predictive prefetch. -Note that it leaves "prescient" prefetch (for. e.g.\& -.Nm zfs Cm send ) +Note that it leaves "prescient" prefetch +.Pq for, e.g., Nm zfs Cm send intact. Unlike predictive prefetch, prescient prefetch never issues I/O that ends up not being needed, so it can't hurt performance. @@ -1644,7 +1655,7 @@ Disable QAT hardware acceleration for AES-GCM encryption. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . -.It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq long +.It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq long Bytes to read per chunk. . .It Sy zfs_read_history Ns = Ns Sy 0 Pq int @@ -1654,7 +1665,7 @@ Historical statistics for this many latest reads will be available in .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int Include cache hits in read history . 
-.It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq ulong +.It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq ulong Maximum read segment size to issue when sequentially resilvering a top-level vdev. . @@ -1664,7 +1675,7 @@ completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . -.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32MB Pc Pq ulong +.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq ulong Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . @@ -1684,8 +1695,8 @@ This should only be used as a last resort, as it typically results in leaked space, or worse. . .It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int -Ignore hard IO errors during device removal. -When set, if a device encounters a hard IO error during the removal process +Ignore hard I/O errors during device removal. +When set, if a device encounters a hard I/O error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is hence not recommended. @@ -1696,7 +1707,7 @@ pool cannot be returned to a healthy state prior to removing the device. This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. . -.It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +.It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int The largest contiguous segment that we will attempt to allocate when removing a device. If there is a performance problem with attempting to allocate large blocks, @@ -1709,7 +1720,7 @@ Ignore the feature, causing an operation that would start a resilver to immediately restart the one in progress. . -.It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3s Pc Pq int +.It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq int Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between TXG flushes. @@ -1720,12 +1731,12 @@ even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . -.It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq int +.It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq int Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . -.It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2h Pc Pq int +.It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq int To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. The frequency of this flushing is determined by this tunable. @@ -1736,7 +1747,7 @@ A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. -Changing the value afterwards will have no affect on scrub or resilver performance. +Changing the value afterwards will have no effect on scrub or resilver performance. . 
.It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq int Determines the order that data will be verified while scrubbing or resilvering: @@ -1762,7 +1773,7 @@ Otherwise indicates that the legacy algorithm will be used, where I/O is initiated as soon as it is discovered. Unsetting will not affect scrubs or resilvers that are already in progress. . -.It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2MB Pc Pq int +.It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int Sets the largest gap in bytes between scrub/resilver I/O operations that will still be considered sequential for sorting purposes. Changing this value will not @@ -1791,7 +1802,7 @@ When disabled, the memory limit may be exceeded by fast disks. Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . -.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4MB Pc Pq int +.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . @@ -1805,48 +1816,48 @@ remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards-compatible stream which will recreate a spill block if it was incorrectly removed. . -.It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +.It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq int The fill fraction of the .Nm zfs Cm send internal queues. The fill fraction controls the timing with which internal threads are woken up. . -.It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +.It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum number of bytes allowed in .Nm zfs Cm send Ns 's internal queues. . -.It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +.It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq int The fill fraction of the .Nm zfs Cm send prefetch queue. The fill fraction controls the timing with which internal threads are woken up. . -.It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +.It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int The maximum number of bytes allowed that will be prefetched by .Nm zfs Cm send . This value must be at least twice the maximum block size in use. . -.It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^-1 Pq int +.It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq int The fill fraction of the .Nm zfs Cm receive queue. The fill fraction controls the timing with which internal threads are woken up. . -.It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +.It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int The maximum number of bytes allowed in the .Nm zfs Cm receive queue. This value must be at least twice the maximum block size in use. . -.It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +.It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum amount of data, in bytes, that .Nm zfs Cm receive will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of -.Sy 32MB . +.Sy 32 MiB . . 
.It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq ulong Setting this variable overrides the default logic for estimating block @@ -1861,7 +1872,7 @@ and you require accurate zfs send size estimates. Flushing of data to disk is done in passes. Defer frees starting in this pass. . -.It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16MB Pc Pq int +.It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . @@ -1883,11 +1894,11 @@ the average number of sync passes; because when we turn compression off, many blocks' size will change, and thus we have to re-allocate (not overwrite) them. It also increases the number of -.Em 128kB +.Em 128 KiB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The -.Em 128kB +.Em 128 KiB allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy these allocations. @@ -1902,11 +1913,11 @@ The default value of .Sy 75% will create a maximum of one thread per CPU. . -.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq uint +.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . -.It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq uint +.It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped, unless they're part of a larger range which was chunked. @@ -1949,25 +1960,25 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . .It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq int -Allow TRIM I/Os to be aggregated. +Allow TRIM I/O operations to be aggregated. This is normally not helpful because the extents to be trimmed will have been already been aggregated by the metaslab. This option is provided for debugging and performance analysis. . -.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int Max vdev I/O aggregation size. . -.It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq int +.It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Max vdev I/O aggregation size for non-rotating media. . -.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64kB Pc Pq int +.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq int Shift size to inflate reads to. . -.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16kB Pc Pq int +.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq int Inflate reads smaller than this value to meet the .Sy zfs_vdev_cache_bshift size -.Pq default Sy 64kB . +.Pq default Sy 64 KiB . . .It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq int Total size of the per-disk cache in bytes. @@ -1989,7 +2000,7 @@ lacks locality as defined by Operations within this that are not immediately following the previous operation are incremented by half. . 
-.It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1MB Pc Pq int +.It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum distance for the last queued I/O operation in which the balancing algorithm considers an operation to have locality. .No See Sx ZFS I/O SCHEDULER . @@ -2007,11 +2018,11 @@ locality as defined by the Operations within this that are not immediately following the previous operation are incremented by half. . -.It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int +.It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq int Aggregate read I/O operations if the on-disk gap between them is within this threshold. . -.It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4kB Pc Pq int +.It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq int Aggregate write I/O operations if the on-disk gap between them is within this threshold. . @@ -2059,7 +2070,7 @@ Setting this to .Sy 0 disables duplicate detection. . -.It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15min Pc Pq int +.It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int Lifespan for a recent ereport that was retained for duplicate checking. . .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int @@ -2078,10 +2089,10 @@ The default value of .Sy 100% will create a maximum of one thread per cpu. . -.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq int +.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this -.Pq typically to Sy 36kB +.Pq typically to Sy 36 KiB can improve performance. . .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int @@ -2094,12 +2105,22 @@ if a volatile out-of-order write cache is enabled. Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . -.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768kB Pc Pq ulong +.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq ulong Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. . -.It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq int +.It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int +Setting this tunable to zero disables ZIL logging of new +.Sy xattr Ns = Ns Sy sa +records if the +.Sy org.openzfs:zilsaxattr +feature is enabled on the pool. +This would only be necessary to work around bugs in the ZIL logging or replay +code for this record type. +The tunable has no effect if the feature is disabled. +. +.It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq int Usually, one metaslab from each normal-class vdev is dedicated for use by the ZIL to log synchronous writes. However, if there are fewer than @@ -2107,6 +2128,14 @@ However, if there are fewer than metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . +.It Sy zfs_zstd_earlyabort_pass Ns = Ns Sy 1 Pq int +Whether heuristic for detection of incompressible data with zstd levels >= 3 +using LZ4 and zstd-1 passes is enabled. +. +.It Sy zfs_zstd_abort_size Ns = Ns Sy 131072 Pq int +Minimal uncompressed size (inclusive) of a record before the early abort +heuristic will be attempted. +. 
.It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable @@ -2116,7 +2145,7 @@ diagnostic information for hang conditions which don't involve a mutex or other locking primitive: typically conditions in which a thread in the zio pipeline is looping indefinitely. . -.It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30s Pc Pq int +.It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int When an I/O operation takes more than this much time to complete, it's marked as slow. Each slow operation causes a delay zevent. @@ -2134,12 +2163,12 @@ is limited by Control the naming scheme used when setting new xattrs in the user namespace. If .Sy 0 -(the default on Linux), user namespace xattr names are prefixed with the -namespace, to be backwards compatible with previous versions of ZFS on Linux. +.Pq the default on Linux , +user namespace xattr names are prefixed with the namespace, to be backwards +compatible with previous versions of ZFS on Linux. If .Sy 1 -(the default on -.Fx ), +.Pq the default on Fx , user namespace xattr names are not prefixed, to be backwards compatible with previous versions of ZFS on illumos and .Fx . @@ -2148,8 +2177,8 @@ Either naming scheme can be read on this and future versions of ZFS, regardless of this tunable, but legacy ZFS on illumos or .Fx are unable to read user namespace xattrs written in the Linux format, and -legacy versions of ZFS on Linux are unable to read user namespace xattrs -written in the legacy ZFS format. +legacy versions of ZFS on Linux are unable to read user namespace xattrs written +in the legacy ZFS format. .Pp An existing xattr with the alternate naming scheme is removed when overwriting the xattr so as to not accumulate duplicates. @@ -2192,7 +2221,7 @@ many blocks, where block size is determined by the .Sy volblocksize property of a zvol. . -.It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128kB Pc Pq uint +.It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint When adding a zvol to the system, prefetch this many bytes from the start and end of the volume. Prefetching these regions of the volume is desirable, @@ -2343,7 +2372,7 @@ This credits the transaction for "time already served", e.g. reading indirect blocks. .Pp The minimum time for a transaction to take is calculated as -.Dl min_time = min( Ns Sy zfs_delay_scale No * (dirty - min) / (max - dirty), 100ms) +.D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms) .Pp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by @@ -2384,7 +2413,7 @@ delay Note, that since the delay is added to the outstanding time remaining on the most recent transaction it's effectively the inverse of IOPS. Here, the midpoint of -.Em 500us +.Em 500 us translates to .Em 2000 IOPS . 
The shape of the curve diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5fa516866668..1eed0526b51d 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -781,34 +781,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. */ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + if (tx_time == 0) + return; + + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && - dsl_pool_wrlog_over_max(tx->tx_pool)) { - DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 7b91ccd067c6..df010c1f096e 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* - * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. - * Once it is reached, write operation is blocked, - * until log data is cleared out after txg sync. 
+ * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ unsigned long zfs_wrlog_data_max = 0; @@ -623,15 +622,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = - zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp); } boolean_t -dsl_pool_wrlog_over_max(dsl_pool_t *dp) +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { - return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void @@ -641,6 +643,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. */ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG
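
For reference, the behavioral core of this patch is that dmu_tx_delay() now derives a candidate delay from each of the two backlogs (dirty data and the TX_WRITE log accounted in dp_wrlog_total), takes the larger of the two, and caps it at zfs_delay_max_ns, while dsl_pool_need_wrlog_delay() triggers the ERESTART/wait path once the wrlog aggsum exceeds zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100. The standalone sketch below only models that combined calculation so the curve can be inspected in isolation; it is not the patched code. The tunable values used (a 4 GiB zfs_dirty_data_max, an 8 GiB zfs_wrlog_data_max, the documented defaults of 500000 for zfs_delay_scale and 60% for zfs_delay_min_dirty_percent, and a 100 ms cap standing in for zfs_delay_max_ns) are illustrative assumptions, and the helpers delay_one() and tx_delay_ns() are hypothetical names, not symbols introduced by this change.

/*
 * Standalone model of the write-throttle delay after this patch.
 * Mirrors the combined dirty-data / wrlog calculation in dmu_tx_delay(),
 * but reads plain variables instead of module parameters and aggsums.
 */
#include <stdint.h>
#include <stdio.h>

#define	MSEC2NSEC(m)	((uint64_t)(m) * 1000000ULL)

static uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assumed 4 GiB */
static uint64_t zfs_wrlog_data_max = 8ULL << 30;	/* assumed 2x dirty max */
static uint64_t zfs_delay_scale = 500000;		/* documented default */
static uint64_t zfs_delay_min_dirty_percent = 60;	/* documented default */
static uint64_t zfs_delay_max_ns = MSEC2NSEC(100);	/* 100 ms cap */

/* Candidate delay for one backlog (dirty data or wrlog) against its limit. */
static uint64_t
delay_one(uint64_t used, uint64_t max)
{
	uint64_t min_bytes = max * zfs_delay_min_dirty_percent / 100;

	if (used >= max)
		return (zfs_delay_max_ns);
	if (used <= min_bytes)
		return (0);
	/* Same hyperbolic curve as the man page: scale * (x - min) / (max - x). */
	return (zfs_delay_scale * (used - min_bytes) / (max - used));
}

/* Combined minimum transaction time, as in the patched dmu_tx_delay(). */
static uint64_t
tx_delay_ns(uint64_t dirty, uint64_t wrlog)
{
	uint64_t t_dirty = delay_one(dirty, zfs_dirty_data_max);
	uint64_t t_wrlog = delay_one(wrlog, zfs_wrlog_data_max);
	uint64_t tx_time = (t_dirty > t_wrlog) ? t_dirty : t_wrlog;

	return ((tx_time > zfs_delay_max_ns) ? zfs_delay_max_ns : tx_time);
}

int
main(void)
{
	/* Sweep wrlog usage from 50% to 100% with modest dirty data. */
	for (int pct = 50; pct <= 100; pct += 10) {
		uint64_t wrlog = zfs_wrlog_data_max / 100 * pct;

		printf("wrlog %3d%% -> delay %10llu ns\n", pct,
		    (unsigned long long)tx_delay_ns(zfs_dirty_data_max / 4,
		    wrlog));
	}
	return (0);
}

Built and run as-is, the sweep shows the delay staying at zero below the 60% threshold, rising along the hyperbolic curve above it, and saturating at the 100 ms cap as the wrlog backlog approaches zfs_wrlog_data_max, which is the smooth throttling the man page change describes in place of the old hard block at the limit.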