From 3475724ea4221a354633d7c4e50d9d90f6bd266f Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 08:48:43 -0700 Subject: [PATCH 1/8] ztest: dmu_tx_assign() gets ENOSPC in spa_vdev_remove_thread() When running zloop, we occasionally see the following crash: dmu_tx_assign(tx, TXG_WAIT) == 0 (0x1c == 0) ASSERT at ../../module/zfs/vdev_removal.c:1507:spa_vdev_remove_thread()/sbin/ztest(+0x89c3)[0x55faf567b9c3] The error value 0x1c is ENOSPC. The transaction used by spa_vdev_remove_thread() should not be able to fail due to being out of space. i.e. we should not call dmu_tx_hold_space(). This will allow the removal thread to schedule its work even when the pool is low on space. The "slop space" will provide enough free space to sync out the txg. Reviewed-by: Igor Kozhukhov Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-37853 Closes #8889 --- module/zfs/vdev_removal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f2d18d9257bd..536a982eca2b 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1498,7 +1498,7 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); From 9c7da9a95aaaecced0a1cfc40190906e7a691327 Mon Sep 17 00:00:00 2001 From: Tulsi Jain Date: Thu, 13 Jun 2019 08:56:15 -0700 Subject: [PATCH 2/8] Restrict filesystem creation if name referred either '.' or '..' This change restricts filesystem creation if the given name contains either '.' or '..' Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Signed-off-by: TulsiJain Closes #8842 Closes #8564 --- include/zfs_namecheck.h | 2 ++ lib/libzfs/libzfs_dataset.c | 10 +++++++++ module/zcommon/zfs_namecheck.c | 21 +++++++++++++++++++ .../zfs_create/zfs_create_009_neg.ksh | 4 +++- 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/zfs_namecheck.h b/include/zfs_namecheck.h index 527db92b0cfa..56d3d36f026e 100644 --- a/include/zfs_namecheck.h +++ b/include/zfs_namecheck.h @@ -43,6 +43,8 @@ typedef enum { NAME_ERR_RESERVED, /* entire name is reserved */ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ NAME_ERR_TOOLONG, /* name is too long */ + NAME_ERR_SELF_REF, /* reserved self path name ('.') */ + NAME_ERR_PARENT_REF, /* reserved parent path name ('..') */ NAME_ERR_NO_AT, /* permission set is missing '@' */ } namecheck_err_t; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 93af50b99cdd..3be205f1f437 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -197,6 +197,16 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, "reserved disk name")); break; + case NAME_ERR_SELF_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "self reference, '.' is found in name")); + break; + + case NAME_ERR_PARENT_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent reference, '..' is found in name")); + break; + default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 58b23b0e00b0..b1e0de6d8181 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -232,6 +232,27 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) } } + if (*end == '\0' || *end == '/') { + int component_length = end - start; + /* Validate the contents of this component is not '.' */ + if (component_length == 1) { + if (start[0] == '.') { + if (why) + *why = NAME_ERR_SELF_REF; + return (-1); + } + } + + /* Validate the content of this component is not '..' */ + if (component_length == 2) { + if (start[0] == '.' && start[1] == '.') { + if (why) + *why = NAME_ERR_PARENT_REF; + return (-1); + } + } + } + /* Snapshot or bookmark delimiter found */ if (*end == '@' || *end == '#') { /* Multiple delimiters are not allowed */ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh index b8190626c7b3..63f5e595ea38 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh @@ -90,7 +90,9 @@ set -A args "$TESTPOOL/" "$TESTPOOL//blah" "$TESTPOOL/@blah" \ "$TESTPOOL/blah*blah" "$TESTPOOL/blah blah" \ "-s $TESTPOOL/$TESTFS1" "-b 1092 $TESTPOOL/$TESTFS1" \ "-b 64k $TESTPOOL/$TESTFS1" "-s -b 32k $TESTPOOL/$TESTFS1" \ - "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" + "$TESTPOOL/$BYND_MAX_NAME" "$TESTPOOL/$BYND_NEST_LIMIT" \ + "$TESTPOOL/." "$TESTPOOL/.." "$TESTPOOL/../blah" "$TESTPOOL/./blah" \ + "$TESTPOOL/blah/./blah" "$TESTPOOL/blah/../blah" log_assert "Verify 'zfs create ' fails with bad argument." From d3230d761ac6234ad20c815f0512a7489f949dad Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:06:15 -0700 Subject: [PATCH 3/8] looping in metaslab_block_picker impacts performance on fragmented pools On fragmented pools with high-performance storage, the looping in metaslab_block_picker() can become the performance-limiting bottleneck. When looking for a larger block (e.g. a 128K block for the ZIL), we may search through many free segments (up to hundreds of thousands) to find one that is large enough to satisfy the allocation. This can take a long time (up to dozens of ms), and is done while holding the ms_lock, which other threads may spin waiting for. When this performance problem is encountered, profiling will show high CPU time in metaslab_block_picker, as well as in mutex_enter from various callers. The problem is very evident on a test system with a sync write workload with 8K writes to a recordsize=8k filesystem, with 4TB of SSD storage, 84% full and 88% fragmented. It has also been observed on production systems with 90TB of storage, 76% full and 87% fragmented. The fix is to change metaslab_df_alloc() to search only up to 16MB from the previous allocation (of this alignment). After that, we will pick a segment that is of the exact size requested (or larger). This reduces the number of iterations to a few hundred on fragmented pools (a ~100x improvement). Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Reviewed-by: Tony Nguyen Reviewed-by: George Wilson Reviewed-by: Serapheim Dimitropoulos Signed-off-by: Matthew Ahrens External-issue: DLPX-62324 Closes #8877 --- man/man5/zfs-module-parameters.5 | 34 ++++++++ module/zfs/metaslab.c | 143 ++++++++++++++++++------------- 2 files changed, 117 insertions(+), 60 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 604f2f6c9056..3ed7bc6e4879 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -325,6 +325,40 @@ Enable use of the fragmentation metric in computing metaslab weights. Use \fB1\fR for yes (default) and \fB0\fR for no. .RE +.sp +.ne 2 +.na +\fBmetaslab_df_max_search\fR (int) +.ad +.RS 12n +Maximum distance to search forward from the last offset. Without this limit, +fragmented pools can see >100,000 iterations and metaslab_block_picker() +becomes the performance limiting factor on high-performance storage. + +With the default setting of 16MB, we typically see less than 500 iterations, +even with very fragmented, ashift=9 pools. The maximum number of iterations +possible is: \fBmetaslab_df_max_search / (2 * (1<100,000 iterations and + * metaslab_block_picker() becomes the performance limiting factor on + * high-performance storage. + * + * With the default setting of 16MB, we typically see less than 500 + * iterations, even with very fragmented, ashift=9 pools. The maximum number + * of iterations possible is: + * metaslab_df_max_search / (2 * (1<rs_start, align); + if (rs != NULL) + first_found = rs->rs_start; + while (rs != NULL && rs->rs_start - first_found <= max_search) { + uint64_t offset = rs->rs_start; if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); @@ -1224,55 +1250,30 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, rs = AVL_NEXT(t, rs); } - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); -} -#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ - -#if defined(WITH_FF_BLOCK_ALLOCATOR) -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); + return (-1ULL); } - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; -#endif /* WITH_FF_BLOCK_ALLOCATOR */ +#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ #if defined(WITH_DF_BLOCK_ALLOCATOR) /* * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * Dynamic Fit (df) block allocator + * + * Search for a free chunk of at least this size, starting from the last + * offset (for this alignment of block) looking for up to + * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not + * found within 16MB, then return a free chunk of exactly the requested size (or + * larger). + * + * If it seems like searching from the last offset will be unproductive, skip + * that and just return a free chunk of exactly the requested size (or larger). + * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This + * mechanism is probably not very useful and may be removed in the future. + * + * The behavior when not searching can be changed to return the largest free + * chunk, instead of a free chunk of exactly the requested size, by setting + * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t @@ -1288,28 +1289,42 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, + ASSERT3U(avl_numnodes(&rt->rt_root), ==, avl_numnodes(&msp->ms_allocatable_by_size)); - if (max_size < size) - return (-1ULL); - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). + * If we're running low on space, find a segment based on size, + * rather than iterating based on offset. */ - if (max_size < metaslab_df_alloc_threshold || + if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; + offset = -1; + } else { + offset = metaslab_block_picker(&rt->rt_root, + cursor, size, metaslab_df_max_search); } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + if (offset == -1) { + range_seg_t *rs; + if (metaslab_df_use_largest_segment) { + /* use largest free segment */ + rs = avl_last(&msp->ms_allocatable_by_size); + } else { + /* use segment of this size, or next largest */ + rs = metaslab_block_find(&msp->ms_allocatable_by_size, + 0, size); + } + if (rs != NULL && rs->rs_start + size <= rs->rs_end) { + offset = rs->rs_start; + *cursor = offset + size; + } + } + + return (offset); } static metaslab_ops_t metaslab_df_ops = { @@ -4823,6 +4838,14 @@ MODULE_PARM_DESC(zfs_metaslab_switch_threshold, module_param(metaslab_force_ganging, ulong, 0644); MODULE_PARM_DESC(metaslab_force_ganging, "blocks larger than this size are forced to be gang blocks"); + +module_param(metaslab_df_max_search, int, 0644); +MODULE_PARM_DESC(metaslab_df_max_search, + "max distance (bytes) to search forward before using size tree"); + +module_param(metaslab_df_use_largest_segment, int, 0644); +MODULE_PARM_DESC(metaslab_df_use_largest_segment, + "when looking in size tree, use largest segment instead of exact fit"); /* END CSTYLED */ #endif From ae5c78e0b13ffeabf1c49a27d3f42a95aa9a678d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 13 Jun 2019 16:08:24 -0400 Subject: [PATCH 4/8] Move write aggregation memory copy out of vq_lock Memory copy is too heavy operation to do under the congested lock. Moving it out reduces congestion by many times to almost invisible. Since the original zio removed from the queue, and the child zio is not executed yet, I don't see why would the copy need protection. My guess it just remained like this from the time when lock was not dropped here, which was added later to fix lock ordering issue. Multi-threaded sequential write tests with both HDD and SSD pools with ZVOL block sizes of 4KB, 16KB, 64KB and 128KB all show major reduction of lock congestion, saving from 15% to 35% of CPU time and increasing throughput from 10% to 40%. Reviewed-by: Richard Yao Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #8890 --- module/zfs/vdev_queue.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e74df76b7530..86b20f134834 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -709,6 +709,18 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + zio_add_child(dio, aio); + vdev_queue_io_remove(vq, dio); + } while (dio != last); + + /* + * We need to drop the vdev queue's lock during zio_execute() to + * avoid a deadlock that we could encounter due to lock order + * reversal between vq_lock and io_lock in zio_change_priority(). + * Use the dropped lock to do memory copy without congestion. + */ + mutex_exit(&vq->vq_lock); + while ((dio = zio_walk_parents(aio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, aio->io_type); if (dio->io_flags & ZIO_FLAG_NODATA) { @@ -720,16 +732,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) dio->io_offset - aio->io_offset, 0, dio->io_size); } - zio_add_child(dio, aio); - vdev_queue_io_remove(vq, dio); - } while (dio != last); - - /* - * We need to drop the vdev queue's lock to avoid a deadlock that we - * could encounter since this I/O will complete immediately. - */ - mutex_exit(&vq->vq_lock); - while ((dio = zio_walk_parents(aio, &zl)) != NULL) { zio_vdev_io_bypass(dio); zio_execute(dio); } From be89734a29fda5a0f5780d953789fb7e91b2a529 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:10:19 -0700 Subject: [PATCH 5/8] compress metadata in later sync passes Starting in sync pass 5 (zfs_sync_pass_dont_compress), we disable compression (including of metadata). Ostensibly this helps the sync passes to converge (i.e. for a sync pass to not need to allocate anything because it is 100% overwrites). However, in practice it increases the average number of sync passes, because when we turn compression off, a lot of block's size will change and thus we have to re-allocate (not overwrite) them. It also increases the number of 128KB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The 128K allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy 128K allocations. We should increase zfs_sync_pass_dont_compress. In practice on a highly fragmented system we see a few 5-pass txg's, a tiny number of 6-pass txg's, and no txg's with more than 6 passes. Reviewed-by: Brian Behlendorf Reviewed-by: Richard Elling Reviewed by: Pavel Zakharov Reviewed-by: Serapheim Dimitropoulos Reviewed-by: George Wilson Signed-off-by: Matthew Ahrens External-issue: DLPX-63431 Closes #8892 --- man/man5/zfs-module-parameters.5 | 16 ++++++++++++++-- module/zfs/zio.c | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 3ed7bc6e4879..7fb24d2755ef 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2478,9 +2478,21 @@ Default value: \fB25\fR. \fBzfs_sync_pass_dont_compress\fR (int) .ad .RS 12n -Don't compress starting in this pass +Starting in this sync pass, we disable compression (including of metadata). +With the default setting, in practice, we don't have this many sync passes, +so this has no effect. +.sp +The original intent was that disabling compression would help the sync passes +to converge. However, in practice disabling compression increases the average +number of sync passes, because when we turn compression off, a lot of block's +size will change and thus we have to re-allocate (not overwrite) them. It +also increases the number of 128KB allocations (e.g. for indirect blocks and +spacemaps) because these will not be compressed. The 128K allocations are +especially detrimental to performance on highly fragmented systems, which may +have very few free segments of this size, and may need to load new metaslabs +to satisfy 128K allocations. .sp -Default value: \fB5\fR. +Default value: \fB8\fR. .RE .sp diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 80a2dbc82397..a6bf8a27b938 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -96,9 +96,23 @@ int zio_slow_io_ms = (30 * MILLISEC); * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. + * + * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable + * compression (including of metadata). In practice, we don't have this + * many sync passes, so this has no effect. + * + * The original intent was that disabling compression would help the sync + * passes to converge. However, in practice disabling compression increases + * the average number of sync passes, because when we turn compression off, a + * lot of block's size will change and thus we have to re-allocate (not + * overwrite) them. It also increases the number of 128KB allocations (e.g. + * for indirect blocks and spacemaps) because these will not be compressed. + * The 128K allocations are especially detrimental to performance on highly + * fragmented systems, which may have very few free segments of this size, + * and may need to load new metaslabs to satisfy 128K allocations. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* From 53dce5acc652800fcfca1b83e22a00c5e4fc9e87 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:12:39 -0700 Subject: [PATCH 6/8] panic in removal_remap test on 4K devices If the zfs_remove_max_segment tunable is changed to be not a multiple of the sector size, then the device removal code will malfunction and try to create mappings that are smaller than one sector, leading to a panic. On debug bits this assertion will fail in spa_vdev_copy_segment(): ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); On nondebug, the system panics with a stack like: metaslab_free_concrete() metaslab_free_impl() metaslab_free_impl_cb() vdev_indirect_remap() free_from_removing_vdev() metaslab_free_impl() metaslab_free_dva() metaslab_free() Fortunately, the default for zfs_remove_max_segment is 1MB, so this can't occur by default. We hit it during this test because removal_remap.ksh changes zfs_remove_max_segment to 1KB. When testing on 4KB-sector disks, we hit the bug. This change makes the zfs_remove_max_segment tunable more robust, automatically rounding it up to a multiple of the sector size. We also turn some key assertions into VERIFY's so that similar bugs would be caught before they are encoded on disk (and thus avoid a panic-reboot-loop). Reviewed-by: Sean Eric Fagan Reviewed-by: Pavel Zakharov Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Sebastien Roy Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-61342 Closes #8893 --- include/sys/vdev_removal.h | 8 ++++---- man/man5/zfs-module-parameters.5 | 27 ++++++++++++++++++++++++++ module/zfs/vdev_label.c | 5 ++--- module/zfs/vdev_removal.c | 33 +++++++++++++++++++++++++------- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 3962237afdab..e3bab0658d62 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_REMOVAL_H @@ -81,13 +81,13 @@ extern void spa_vdev_condense_suspend(spa_t *); extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); -extern void svr_sync(spa_t *spa, dmu_tx_t *tx); +extern void svr_sync(spa_t *, dmu_tx_t *); extern void spa_vdev_remove_suspend(spa_t *); extern int spa_vdev_remove_cancel(spa_t *); -extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *); +extern uint64_t spa_remove_max_segment(spa_t *); extern int vdev_removal_max_span; -extern int zfs_remove_max_segment; #ifdef __cplusplus } diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 7fb24d2755ef..77b4c2801e0e 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2228,6 +2228,33 @@ pool cannot be returned to a healthy state prior to removing the device. Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBzfs_removal_suspend_progress\fR (int) +.ad +.RS 12n +.sp +This is used by the test suite so that it can ensure that certain actions +happen while in the middle of a removal. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_remove_max_segment\fR (int) +.ad +.RS 12n +.sp +The largest contiguous segment that we will attempt to allocate when removing +a device. This can be no larger than 16MB. If there is a performance +problem with attempting to allocate large blocks, consider decreasing this. +.sp +Default value: \fB16,777,216\fR (16MB). +.RE + .sp .ne 2 .na diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a0e373b3dfc5..6320732ed6da 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,8 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -613,7 +612,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. */ - seg_count += to_alloc / zfs_remove_max_segment; + seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 536a982eca2b..6f64edd8c473 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #include @@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. + * + * See also the accessor function spa_remove_max_segment(). */ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; @@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = range_tree_min(segs); + ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { @@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, } } ASSERT3U(size, <=, maxalloc); + ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space @@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, /* * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. + * misunderstand what's allocated, and the size of the mapping. We + * prevent padding by ensuring that all devices in the pool have the + * same ashift, and the allocation size is a multiple of the ashift. */ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); @@ -1363,6 +1368,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, range_tree_destroy(segs); } +/* + * The size of each removal mapping is limited by the tunable + * zfs_remove_max_segment, but we must adjust this to be a multiple of the + * pool's ashift, so that we don't try to split individual sectors regardless + * of the tunable value. (Note that device removal requires that all devices + * have the same ashift, so there's no difference between spa_min_ashift and + * spa_max_ashift.) The raw tunable should not be used elsewhere. + */ +uint64_t +spa_remove_max_segment(spa_t *spa) +{ + return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); +} + /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. @@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg) spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; + uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg) vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) - max_alloc = zfs_remove_max_segment; + max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); From 7218b29e4bb86fb8f17459f355b609e23c6aaa54 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 13 Jun 2019 13:14:35 -0700 Subject: [PATCH 7/8] lz4_decompress_abd declared but not defined `lz4_decompress_abd` is declared in zio_compress.h but it is not defined anywhere. The declaration should be removed. Reviewed by: Dan Kimmel Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens External-issue: DLPX-47477 Closes #8894 --- include/sys/zio_compress.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 1642823d3d42..208117eee4b5 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -105,8 +105,7 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); -extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, - int level); + /* * Compress and decompress data if necessary. */ From b1b4ac27082aede8522e479c87897026519f1dd7 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 13 Jun 2019 13:15:46 -0700 Subject: [PATCH 8/8] Python config cleanup Don't require Python at configure/build unless building pyzfs. Move ZFS_AC_PYTHON_MODULE to always-pyzfs.m4 where it is used. Make test syntax more consistent. Sponsored by: iXsystems, Inc. Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Moeller Closes #8895 --- config/always-python.m4 | 87 +++++++++++------------------------------ config/always-pyzfs.m4 | 45 ++++++++++++++------- 2 files changed, 53 insertions(+), 79 deletions(-) diff --git a/config/always-python.m4 b/config/always-python.m4 index 7cfefd9ebcae..c1c07597e688 100644 --- a/config/always-python.m4 +++ b/config/always-python.m4 @@ -1,47 +1,3 @@ -dnl # -dnl # ZFS_AC_PYTHON_VERSION(version, [action-if-true], [action-if-false]) -dnl # -dnl # Verify Python version -dnl # -AC_DEFUN([ZFS_AC_PYTHON_VERSION], [ - ver_check=`$PYTHON -c "import sys; print (sys.version.split()[[0]] $1)"` - AS_IF([test "$ver_check" = "True"], [ - m4_ifvaln([$2], [$2]) - ], [ - m4_ifvaln([$3], [$3]) - ]) -]) - -dnl # -dnl # ZFS_AC_PYTHON_VERSION_IS_2 -dnl # ZFS_AC_PYTHON_VERSION_IS_3 -dnl # -dnl # Tests if the $PYTHON_VERSION matches 2.x or 3.x. -dnl # -AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_2], - [test "${PYTHON_VERSION%%\.*}" = "2"]) -AC_DEFUN([ZFS_AC_PYTHON_VERSION_IS_3], - [test "${PYTHON_VERSION%%\.*}" = "3"]) - -dnl # -dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) -dnl # -dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE -dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html -dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. -dnl # -AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ - PYTHON_NAME=`basename $PYTHON` - AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) - AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ - AC_MSG_RESULT(yes) - m4_ifvaln([$2], [$2]) - ], [ - AC_MSG_RESULT(no) - m4_ifvaln([$3], [$3]) - ]) -]) - dnl # dnl # The majority of the python scripts are written to be compatible dnl # with Python 2.6 and Python 3.4. Therefore, they may be installed @@ -66,35 +22,38 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [ [AC_MSG_ERROR([Unknown --with-python value '$with_python'])] ) - AS_IF([test $PYTHON != :], [ - AS_IF([$PYTHON --version >/dev/null 2>&1], - [AM_PATH_PYTHON([2.6], [], [:])], - [AC_MSG_ERROR([Cannot find $PYTHON in your system path])] - ) - ]) - AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) - AM_CONDITIONAL([USING_PYTHON_2], [ZFS_AC_PYTHON_VERSION_IS_2]) - AM_CONDITIONAL([USING_PYTHON_3], [ZFS_AC_PYTHON_VERSION_IS_3]) - dnl # dnl # Minimum supported Python versions for utilities: - dnl # Python 2.6.x, or Python 3.4.x + dnl # Python 2.6 or Python 3.4 dnl # - AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [ - ZFS_AC_PYTHON_VERSION([>= '2.6'], [ true ], - [AC_MSG_ERROR("Python >= 2.6.x is not available")]) + AM_PATH_PYTHON([], [], [:]) + AS_IF([test -z "$PYTHON_VERSION"], [ + PYTHON_VERSION=$(basename $PYTHON | tr -cd 0-9.) ]) + PYTHON_MINOR=${PYTHON_VERSION#*\.} - AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [ - ZFS_AC_PYTHON_VERSION([>= '3.4'], [ true ], - [AC_MSG_ERROR("Python >= 3.4.x is not available")]) - ]) + AS_CASE([$PYTHON_VERSION], + [2.*], [ + AS_IF([test $PYTHON_MINOR -lt 6], + [AC_MSG_ERROR("Python >= 2.6 is required")]) + ], + [3.*], [ + AS_IF([test $PYTHON_MINOR -lt 4], + [AC_MSG_ERROR("Python >= 3.4 is required")]) + ], + [:|2|3], [], + [PYTHON_VERSION=3] + ) + + AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) + AM_CONDITIONAL([USING_PYTHON_2], [test "x${PYTHON_VERSION%%\.*}" = x2]) + AM_CONDITIONAL([USING_PYTHON_3], [test "x${PYTHON_VERSION%%\.*}" = x3]) dnl # dnl # Request that packages be built for a specific Python version. dnl # - AS_IF([test $with_python != check], [ - PYTHON_PKG_VERSION=`echo ${PYTHON} | tr -d 'a-zA-Z.'` + AS_IF([test "x$with_python" != xcheck], [ + PYTHON_PKG_VERSION=$(echo $PYTHON_VERSION | tr -d .) DEFINE_PYTHON_PKG_VERSION='--define "__use_python_pkg_version '${PYTHON_PKG_VERSION}'"' DEFINE_PYTHON_VERSION='--define "__use_python '${PYTHON}'"' ], [ diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index 6f32e98feed2..f620a8f9a18b 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -1,5 +1,24 @@ dnl # -dnl # Determines if pyzfs can be built, requires Python 2.7 or latter. +dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) +dnl # +dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE +dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html +dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. +dnl # +AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ + PYTHON_NAME=$(basename $PYTHON) + AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) + AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ + AC_MSG_RESULT(yes) + m4_ifvaln([$2], [$2]) + ], [ + AC_MSG_RESULT(no) + m4_ifvaln([$3], [$3]) + ]) +]) + +dnl # +dnl # Determines if pyzfs can be built, requires Python 2.7 or later. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ AC_ARG_ENABLE([pyzfs], @@ -18,7 +37,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ DEFINE_PYZFS='--without pyzfs' ]) ], [ - AS_IF([test $PYTHON != :], [ + AS_IF([test "$PYTHON" != :], [ DEFINE_PYZFS='' ], [ enable_pyzfs=no @@ -31,20 +50,16 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ dnl # Require python-devel libraries dnl # AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ - AS_IF([ZFS_AC_PYTHON_VERSION_IS_2], [ - PYTHON_REQUIRED_VERSION=">= '2.7.0'" - ], [ - AS_IF([ZFS_AC_PYTHON_VERSION_IS_3], [ - PYTHON_REQUIRED_VERSION=">= '3.4.0'" - ], [ - AC_MSG_ERROR("Python $PYTHON_VERSION unknown") - ]) - ]) + AS_CASE([$PYTHON_VERSION], + [3.*], [PYTHON_REQUIRED_VERSION=">= '3.4.0'"], + [2.*], [PYTHON_REQUIRED_VERSION=">= '2.7.0'"], + [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] + ) AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -57,7 +72,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([setuptools], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION setuptools is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -70,7 +85,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ ZFS_AC_PYTHON_MODULE([cffi], [], [ AS_IF([test "x$enable_pyzfs" = xyes], [ AC_MSG_ERROR("Python $PYTHON_VERSION cffi is not installed") - ], [test ! "x$enable_pyzfs" = xno], [ + ], [test "x$enable_pyzfs" != xno], [ enable_pyzfs=no ]) ]) @@ -81,7 +96,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ dnl # AS_IF([test "x$enable_pyzfs" = xcheck], [enable_pyzfs=yes]) - AM_CONDITIONAL([PYZFS_ENABLED], [test x$enable_pyzfs = xyes]) + AM_CONDITIONAL([PYZFS_ENABLED], [test "x$enable_pyzfs" = xyes]) AC_SUBST([PYZFS_ENABLED], [$enable_pyzfs]) AC_SUBST(pythonsitedir, [$PYTHON_SITE_PKG])