diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 3b67576401fa..913ac483cecd 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -11061,8 +11061,12 @@ zpool_do_rebalance(int argc, char **argv)
 	int err = zpool_rebalance(zhp, vdev);
 
+	int ret = 0;
+	if (err != 0)
+		ret = 1;
+
 	zpool_close(zhp);
 
-	return (err);
+	return (ret);
 }
 
 static int
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 815b5d0c9cf1..2b4429688b05 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -88,6 +88,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
     int);
 int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int,
     dva_t *, uint64_t, int, zio_alloc_list_t *, int);
+uint64_t metaslab_group_alloc(metaslab_group_t *, zio_alloc_list_t *,
+    uint64_t, uint64_t, boolean_t, dva_t *, int, int, boolean_t);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
 void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
 void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 1a04bedc3137..4bf3d1115006 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -318,6 +318,7 @@ struct spa {
 	uint64_t	spa_nonallocating_dspace;
 	spa_removing_phys_t spa_removing_phys;
 	spa_vdev_removal_t *spa_vdev_removal;
+	spa_vdev_rebalance_t *spa_vdev_rebalance;
 
 	spa_condensing_indirect_phys_t	spa_condensing_indirect_phys;
 	spa_condensing_indirect_t	*spa_condensing_indirect;
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 0e12fb96f90a..efe32ebbb7bd 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -167,7 +167,6 @@ typedef struct vdev_rebalance_node {
 	uint64_t	vrn_offset;
 	uint64_t	vrn_size;
 	dva_t		vrn_dest;
-	uint64_t	vrn_txg;
 } vdev_rebalance_node_t;
 
@@ -654,6 +653,9 @@ extern uint_t zfs_vdev_max_auto_ashift;
 int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 
+
+extern int vdev_rebalance_node_cmp(void *a, void *b);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h
index 7bff86ad1ddb..40996d4b87b9 100644
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -73,7 +73,8 @@ typedef struct spa_vdev_rebalance {
 	uint64_t	svr_vdev_id;
 	int		svr_target;
 	list_t		svr_sources;
-
+	uint64_t	svr_start_offset;
+	range_tree_t	*svr_allocd_segs;
 } spa_vdev_rebalance_t;
 
 typedef struct spa_condensing_indirect {
@@ -95,6 +96,7 @@ extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
 extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
 extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
 extern void svr_sync(spa_t *, dmu_tx_t *);
+extern void spa_rebalance_some(spa_t *, dmu_tx_t *);
 extern void spa_vdev_remove_suspend(spa_t *);
 extern int spa_vdev_remove_cancel(spa_t *);
 extern void spa_vdev_removal_destroy(spa_vdev_removal_t *);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index dd4ff77e6f5d..89f04476b5c1 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5098,7 +5098,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
 	return (offset);
 }
 
-static uint64_t
+uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index cda62f939c1e..fa467e4ff116 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -9261,6 +9261,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 		dsl_scan_sync(dp, tx);
 		dsl_errorscrub_sync(dp, tx);
 		svr_sync(spa, tx);
+		spa_rebalance_some(spa, tx);
 		spa_sync_upgrades(spa, tx);
 
 		spa_flush_metaslabs(spa, tx);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 49d76125bde7..090077f9d52b 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -416,7 +416,7 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
 	return (err);
 }
 
-static int
+int
 vdev_rebalance_node_cmp(void *a, void *b)
 {
 	vdev_rebalance_node_t *c1 = a;
@@ -1209,16 +1209,18 @@ typedef struct remap_segment {
 	uint64_t	rs_asize;
 	uint64_t	rs_split_offset;
 	list_node_t	rs_node;
+	uint64_t	rs_birth;
 } remap_segment_t;
 
 static remap_segment_t *
-rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset, uint64_t birth)
 {
 	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
 	rs->rs_vd = vd;
 	rs->rs_offset = offset;
 	rs->rs_asize = asize;
 	rs->rs_split_offset = split_offset;
+	rs->rs_birth = birth;
 	return (rs);
 }
 
@@ -1459,7 +1461,7 @@ vdev_indirect_remap(vdev_t *vd, uint64_t birth, uint64_t offset, uint64_t asize,
 	list_create(&stack, sizeof (remap_segment_t),
 	    offsetof(remap_segment_t, rs_node));
 
-	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
+	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0, birth);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
 
@@ -1489,7 +1491,7 @@ vdev_indirect_remap(vdev_t *vd, uint64_t birth, uint64_t offset, uint64_t asize,
 		avl_tree_t *t = vd->vdev_rebalance_tree;
 
 		vdev_rebalance_node_t *idx =
-		    vdev_rebalance_get_first(v, birth,
+		    vdev_rebalance_get_first(v, rs->rs_birth,
 		    rs->rs_offset, rs->rs_asize);
 		vdev_rebalance_node_t *next;
 		if (rs->rs_offset < idx->vrn_offset) {
@@ -1525,7 +1527,7 @@ vdev_indirect_remap(vdev_t *vd, uint64_t birth, uint64_t offset, uint64_t asize,
 			if (dst_v->vdev_rebalance_tree) {
 				list_insert_head(&stack, rs_alloc(dst_v,
 				    dst_offset + inner_offset,
-				    inner_size, rs->rs_split_offset));
+				    inner_size, rs->rs_split_offset, idx->vrn_birth));
 			}
 
 			func(rs->rs_split_offset, dst_v,
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index b679437b37ed..6fcde291ef17 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1119,7 +1119,7 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
  * read from the old location and write to the new location.
  */
 static int
-spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+spa_vdev_copy_segment(vdev_t *vd, vdev_t *dest, range_tree_t *segs,
     uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
@@ -1161,18 +1161,26 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 	}
 	ASSERT3U(size, <=, maxalloc);
 	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
-
-	/*
-	 * An allocation class might not have any remaining vdevs or space
-	 */
-	metaslab_class_t *mc = mg->mg_class;
-	if (mc->mc_groups == 0)
-		mc = spa_normal_class(spa);
-	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
-	    METASLAB_DONT_THROTTLE, zal, 0);
-	if (error == ENOSPC && mc != spa_normal_class(spa)) {
-		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
-		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
+	int error = 0;
+	if (dest) {
+		/* Allocate directly on the requested destination vdev. */
+		uint64_t offset = metaslab_group_alloc(dest->vdev_mg, zal,
+		    size, txg, B_FALSE, &dst, 0, 0, B_FALSE);
+		if (offset == -1ULL) {
+			error = ENOSPC;
+		}
+	} else {
+		/*
+		 * An allocation class might not have any remaining vdevs or space
+		 */
+		metaslab_class_t *mc = mg->mg_class;
+		if (mc->mc_groups == 0)
+			mc = spa_normal_class(spa);
+		error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
+		    METASLAB_DONT_THROTTLE, zal, 0);
+		if (error == ENOSPC && mc != spa_normal_class(spa)) {
+			error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+			    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
+		}
 	}
 	if (error != 0)
 		return (error);
@@ -1231,16 +1239,15 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
 	    spa_vdev_copy_segment_done, vcsa, 0);
-	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
-	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
-		for (int i = 0; i < dest_vd->vdev_children; i++) {
-			vdev_t *child = dest_vd->vdev_child[i];
+	if (dest->vdev_ops == &vdev_mirror_ops) {
+		for (int i = 0; i < dest->vdev_children; i++) {
+			vdev_t *child = dest->vdev_child[i];
 			spa_vdev_copy_one_child(vca, nzio, vd, start,
 			    child, DVA_GET_OFFSET(&dst), i, size);
 		}
 	} else {
 		spa_vdev_copy_one_child(vca, nzio, vd, start,
-		    dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+		    dest, DVA_GET_OFFSET(&dst), -1, size);
 	}
 	zio_nowait(nzio);
 
@@ -2555,10 +2562,10 @@ spa_vdev_rebalance(spa_t *spa, uint64_t vdev_guid)
 	int pfull = vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_space;
 	zfs_dbgmsg("rebalancing vdev: %llu %s %d", (u_longlong_t)vdev_guid, vd->vdev_path, pfull);
 
-	spa_vdev_rebalance_t svr;
-	svr.svr_target = pfull;
-	svr.svr_vdev_id = vd->vdev_id;
-	list_create(&svr.svr_sources, sizeof (spa_vdev_rebalance_source_info_t),
+	spa_vdev_rebalance_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
+	svr->svr_target = pfull;
+	svr->svr_vdev_id = vd->vdev_id;
+	list_create(&svr->svr_sources, sizeof (spa_vdev_rebalance_source_info_t),
 	    offsetof(spa_vdev_rebalance_source_info_t, svrsi_node));
 
 	// TODO need to consider in descending order of fullness to do this right
@@ -2575,7 +2582,7 @@ spa_vdev_rebalance(spa_t *spa, uint64_t vdev_guid)
 		if (cfull > pfull && cfull > tfull) {
 			spa_vdev_rebalance_source_info_t *svrsi = kmem_zalloc(sizeof (*svrsi), KM_SLEEP);
 			svrsi->svrsi_id = id;
-			list_insert_tail(&svr.svr_sources, svrsi);
+			list_insert_tail(&svr->svr_sources, svrsi);
 			zfs_dbgmsg("selected vdev: %s %d > %d", cvd->vdev_path, cfull, pfull);
 			talloc += cvd->vdev_stat.vs_alloc;
 			tspace += cvd->vdev_stat.vs_space;
@@ -2584,13 +2591,13 @@ spa_vdev_rebalance(spa_t *spa, uint64_t vdev_guid)
 	}
 
 	if (count == 0) {
-		list_destroy(&svr.svr_sources);
+		list_destroy(&svr->svr_sources);
 		return (0);
 	}
 
 	int tfull = 100 * talloc / tspace;
 	uint64_t sum = 0;
-	spa_vdev_rebalance_source_info_t *head = list_head(&svr.svr_sources);
+	spa_vdev_rebalance_source_info_t *head = list_head(&svr->svr_sources);
 	ASSERT(head);
 
 	do {
@@ -2599,8 +2606,8 @@ spa_vdev_rebalance(spa_t *spa, uint64_t vdev_guid)
 		if (cfull < tfull) {
 			zfs_dbgmsg("Skipping %s, %d", cvd->vdev_path, cfull);
 			spa_vdev_rebalance_source_info_t *cur = head;
-			head = list_next(&svr.svr_sources, head);
-			list_remove(&svr.svr_sources, cur);
+			head = list_next(&svr->svr_sources, head);
+			list_remove(&svr->svr_sources, cur);
 			kmem_free(cur, sizeof (*cur));
 			continue;
 		}
@@ -2608,24 +2615,374 @@ spa_vdev_rebalance(spa_t *spa, uint64_t vdev_guid)
 		sum += head->svrsi_bytes;
 		zfs_dbgmsg("Moving %llu bytes from %s to %s", (u_longlong_t)head->svrsi_bytes, cvd->vdev_path, vd->vdev_path);
 		zfs_dbgmsg("%s will be %d full", cvd->vdev_path, (int)(100 * (cvd->vdev_stat.vs_alloc - head->svrsi_bytes) / cvd->vdev_stat.vs_space));
-		head = list_next(&svr.svr_sources, head);
+		head = list_next(&svr->svr_sources, head);
 	} while (head);
 
 	zfs_dbgmsg("Moving %llu bytes total to %s", (u_longlong_t)sum, vd->vdev_path);
 	zfs_dbgmsg("%s will be %d full", vd->vdev_path, (int)(100 * (vd->vdev_stat.vs_alloc + sum) / vd->vdev_stat.vs_space));
+	spa->spa_vdev_rebalance = svr;
+
+	return (0);
+}
+
+
+
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
+static int
+spa_vdev_copy_segment_rebal(spa_vdev_rebalance_t *svr, vdev_t *vd,
+    vdev_t *dest, range_tree_t *segs, uint64_t maxalloc, uint64_t txg,
+    vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
+{
+	metaslab_group_t *mg = vd->vdev_mg;
+	spa_t *spa = vd->vdev_spa;
+	vdev_indirect_mapping_entry_t *entry;
+	dva_t dst = {{ 0 }};
+	uint64_t start = range_tree_min(segs);
+	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
+
+	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
+
+	uint64_t size = range_tree_span(segs);
+	if (range_tree_span(segs) > maxalloc) {
+		/*
+		 * We can't allocate all the segments. Prefer to end
+		 * the allocation at the end of a segment, thus avoiding
+		 * additional split blocks.
+		 */
+		range_seg_max_t search;
+		zfs_btree_index_t where;
+		rs_set_start(&search, segs, start + maxalloc);
+		rs_set_end(&search, segs, start + maxalloc);
+		(void) zfs_btree_find(&segs->rt_root, &search, &where);
+		range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
+		    &where);
+		if (rs != NULL) {
+			size = rs_get_end(rs, segs) - start;
+		} else {
+			/*
+			 * There are no segments that end before maxalloc.
+			 * I.e. the first segment is larger than maxalloc,
+			 * so we must split it.
+			 */
+			size = maxalloc;
+		}
+	}
+	ASSERT3U(size, <=, maxalloc);
+	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
+	int error = 0;
+	if (dest) {
+		/* Allocate on the rebalance destination vdev. */
+		uint64_t offset = metaslab_group_alloc(dest->vdev_mg, zal,
+		    size, txg, B_FALSE, &dst, 0, 0, B_FALSE);
+		if (offset == -1ULL) {
+			error = ENOSPC;
+		}
+	} else {
+		/*
+		 * An allocation class might not have any remaining vdevs or space
+		 */
+		metaslab_class_t *mc = mg->mg_class;
+		if (mc->mc_groups == 0)
+			mc = spa_normal_class(spa);
+		error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
+		    METASLAB_DONT_THROTTLE, zal, 0);
+		if (error == ENOSPC && mc != spa_normal_class(spa)) {
+			error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+			    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
+		}
+	}
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Determine the ranges that are not actually needed. Offsets are
+	 * relative to the start of the range to be copied (i.e. relative to the
+	 * local variable "start").
+	 */
+	range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
+	    0, 0);
-	/*avl_creeate(&vd->vdev_rebalance_tree, vdev_rebalance_node_cmp,
-	    sizeof (vdev_rebalance_node_t), offsetof(vdev_rebalance_node_t,
-	    vrn_node));*/
+	zfs_btree_index_t where;
+	range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
+	ASSERT3U(rs_get_start(rs, segs), ==, start);
+	uint64_t prev_seg_end = rs_get_end(rs, segs);
+	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
+		if (rs_get_start(rs, segs) >= start + size) {
+			break;
+		} else {
+			range_tree_add(obsolete_segs,
+			    prev_seg_end - start,
+			    rs_get_start(rs, segs) - prev_seg_end);
+		}
+		prev_seg_end = rs_get_end(rs, segs);
+	}
+	/* We don't end in the middle of an obsolete range */
+	ASSERT3U(start + size, <=, prev_seg_end);
+
+	range_tree_clear(segs, start, size);
+
+	/*
+	 * We can't have any padding of the allocated size, otherwise we will
+	 * misunderstand what's allocated, and the size of the mapping. We
+	 * prevent padding by ensuring that all devices in the pool have the
+	 * same ashift, and the allocation size is a multiple of the ashift.
+	 */
+	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
+
+	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+	vcsa->vcsa_dest_dva = &dst;
+	vcsa->vcsa_spa = spa;
+	vcsa->vcsa_txg = txg;
+
+	/*
+	 * See comment before spa_vdev_copy_one_child().
+	 */
+	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+	    spa_vdev_copy_segment_done, vcsa, 0);
+	if (dest->vdev_ops == &vdev_mirror_ops) {
+		for (int i = 0; i < dest->vdev_children; i++) {
+			vdev_t *child = dest->vdev_child[i];
+			spa_vdev_copy_one_child(vca, nzio, vd, start,
+			    child, DVA_GET_OFFSET(&dst), i, size);
+		}
+	} else {
+		spa_vdev_copy_one_child(vca, nzio, vd, start,
+		    dest, DVA_GET_OFFSET(&dst), -1, size);
+	}
+	zio_nowait(nzio);
-	while ((head = list_remove_head(&svr.svr_sources))) {
-		kmem_free(head, sizeof (*head));
+	vdev_rebalance_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	node->vrn_dest = dst;
+	node->vrn_birth = txg;
+	node->vrn_offset = start;
+	node->vrn_size = size;
+	if (vd->vdev_rebalance_tree == NULL) {
+		vd->vdev_rebalance_tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+		avl_create(vd->vdev_rebalance_tree, vdev_rebalance_node_cmp,
+		    sizeof (vdev_rebalance_node_t), offsetof(vdev_rebalance_node_t,
+		    vrn_node));
 	}
-	list_destroy(&svr.svr_sources);
+	avl_insert(vd->vdev_rebalance_tree, node);
+
+	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
+	vdev_dirty(vd, 0, NULL, txg);
 
 	return (0);
 }
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_rebal_impl(vdev_t *vd, spa_vdev_rebalance_t *svr,
+    vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx)
+{
+	uint64_t txg = dmu_tx_get_txg(tx);
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	/*
+	 * Determine how big of a chunk to copy. We can allocate up
+	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
+	 * bytes of unallocated space at a time. "segs" will track the
+	 * allocated segments that we are copying. We may also be copying
+	 * free segments (of up to vdev_removal_max_span bytes).
+	 */
+	range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+	for (;;) {
+		range_tree_t *rt = svr->svr_allocd_segs;
+		range_seg_t *rs = range_tree_first(rt);
+
+		if (rs == NULL)
+			break;
+
+		uint64_t seg_length;
+
+		if (range_tree_is_empty(segs)) {
+			/* need to truncate the first seg based on max_alloc */
+			seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
+			    rt), *max_alloc);
+		} else {
+			if (rs_get_start(rs, rt) - range_tree_max(segs) >
+			    vdev_removal_max_span) {
+				/*
+				 * Including this segment would cause us to
+				 * copy a larger unneeded chunk than is allowed.
+				 */
+				break;
+			} else if (rs_get_end(rs, rt) - range_tree_min(segs) >
+			    *max_alloc) {
+				/*
+				 * This additional segment would extend past
+				 * max_alloc. Rather than splitting this
+				 * segment, leave it for the next mapping.
+				 */
+				break;
+			} else {
+				seg_length = rs_get_end(rs, rt) -
+				    rs_get_start(rs, rt);
+			}
+		}
+
+		range_tree_add(segs, rs_get_start(rs, rt), seg_length);
+		range_tree_remove(svr->svr_allocd_segs,
+		    rs_get_start(rs, rt), seg_length);
+	}
+
+	if (range_tree_is_empty(segs)) {
+		range_tree_destroy(segs);
+		return;
+	}
+
+	zio_alloc_list_t zal;
+	metaslab_trace_init(&zal);
+	uint64_t thismax = SPA_MAXBLOCKSIZE;
+	vdev_t *dest = vdev_lookup_top(spa, svr->svr_vdev_id);
+	while (!range_tree_is_empty(segs)) {
+		int error = spa_vdev_copy_segment_rebal(svr, vd, dest,
+		    segs, thismax, txg, vca, &zal);
+
+		if (error == ENOSPC) {
+			/*
+			 * Cut our segment in half, and don't try this
+			 * segment size again this txg. Note that the
+			 * allocation size must be aligned to the highest
+			 * ashift in the pool, so that the allocation will
+			 * not be padded out to a multiple of the ashift,
+			 * which could cause us to think that this mapping
+			 * is larger than we intended.
+			 */
+			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+			uint64_t attempted =
+			    MIN(range_tree_span(segs), thismax);
+			thismax = P2ROUNDUP(attempted / 2,
+			    1 << spa->spa_max_ashift);
+			/*
+			 * The minimum-size allocation can not fail.
+			 */
+			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+			*max_alloc = attempted - (1 << spa->spa_max_ashift);
+		} else {
+			ASSERT0(error);
+
+			/*
+			 * We've performed an allocation, so reset the
+			 * alloc trace list.
+			 */
+			metaslab_trace_fini(&zal);
+			metaslab_trace_init(&zal);
+		}
+	}
+	metaslab_trace_fini(&zal);
+	range_tree_destroy(segs);
+}
+
+void
+spa_rebalance_some(spa_t *spa, dmu_tx_t *tx)
+{
+	spa_vdev_rebalance_t *svr = spa->spa_vdev_rebalance;
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+	if (svr == NULL)
+		return;
+
+	/*
+	 * Only do the copy work in the first sync pass.
+	 */
+	if (spa->spa_sync_pass != 1)
+		return;
+
+	vdev_copy_arg_t vca = {0};
+	spa_vdev_rebalance_source_info_t *svrsi = list_head(&svr->svr_sources);
+	vdev_t *vd = vdev_lookup_top(spa, svrsi->svrsi_id);
+	uint64_t start_offset = svr->svr_start_offset;
+	uint64_t msi;
+	uint64_t max_alloc = spa_remove_max_segment(spa);
+	for (msi = svr->svr_start_offset;
+	    msi < vd->vdev_ms_count; msi++) {
+		metaslab_t *msp = vd->vdev_ms[msi];
+		ASSERT3U(msi, <=, vd->vdev_ms_count);
+		range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+
+		mutex_enter(&msp->ms_sync_lock);
+		mutex_enter(&msp->ms_lock);
+		/*
+		 * Assert nothing in flight -- ms_*tree is empty.
+		 */
+		for (int i = 0; i < TXG_SIZE; i++) {
+			ASSERT0(range_tree_space(msp->ms_allocating[i]));
+		}
+
+		/*
+		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+		 * read the allocated segments from the space map object
+		 * into svr_allocd_segs. Since we do this while holding
+		 * svr_lock and ms_sync_lock, concurrent frees (which
+		 * would have modified the space map) will wait for us
+		 * to finish loading the spacemap, and then take the
+		 * appropriate action (see free_from_removing_vdev()).
+		 */
+		if (msp->ms_sm != NULL) {
+			VERIFY0(space_map_load(msp->ms_sm,
+			    segs, SM_ALLOC));
+
+			range_tree_walk(msp->ms_unflushed_allocs,
+			    range_tree_add, segs);
+			range_tree_walk(msp->ms_unflushed_frees,
+			    range_tree_remove, segs);
+			range_tree_walk(msp->ms_freeing,
+			    range_tree_remove, segs);
+
+			/*
+			 * When we are resuming from a paused removal (i.e.
+			 * when importing a pool with a removal in progress),
+			 * discard any state that we have already processed.
+			 */
+			range_tree_clear(segs, 0, start_offset);
+		}
+		mutex_exit(&msp->ms_lock);
+		mutex_exit(&msp->ms_sync_lock);
+
+		vca.vca_msp = msp;
+		zfs_dbgmsg("copying %llu segments for metaslab %llu",
+		    (u_longlong_t)zfs_btree_numnodes(&segs->rt_root),
+		    (u_longlong_t)msp->ms_id);
+
+		svr->svr_allocd_segs = segs;
+		while (!range_tree_is_empty(segs)) {
+			if (vca.vca_outstanding_bytes >
+			    zfs_remove_max_copy_bytes ||
+			    vca.vca_outstanding_bytes >= svrsi->svrsi_bytes) {
+				break;
+			}
+			spa_vdev_copy_rebal_impl(vd, svr, &vca, &max_alloc, tx);
+		}
+		svrsi->svrsi_bytes -= vca.vca_outstanding_bytes;
+		svr->svr_allocd_segs = NULL;
+		range_tree_destroy(segs);
+		if (vca.vca_outstanding_bytes >
+		    zfs_remove_max_copy_bytes ||
+		    svrsi->svrsi_bytes == 0) {
+			break;
+		}
+	}
+	if (msi == vd->vdev_ms_count ||
+	    vca.vca_outstanding_bytes >= svrsi->svrsi_bytes) {
+		list_remove_head(&svr->svr_sources);
+		kmem_free(svrsi, sizeof (*svrsi));
+		msi = 0;
+	}
+	svr->svr_start_offset = msi;
+}
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
 	"Ignore hard IO errors when removing device");
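
Note on the arithmetic in spa_vdev_rebalance() above: the destination's fullness (pfull) selects the sources, and the combined fullness of the selected sources (tfull) is the level the pool converges toward. The following is a standalone user-space sketch of that selection, not part of the patch; the vdev_info struct and the sample sizes are made up, and the final "shed everything above tfull" sizing is only an assumption, since the patch's per-source byte computation (svrsi_bytes) is not visible in these hunks.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the vs_alloc/vs_space fields of vdev_stat_t. */
struct vdev_info {
	const char *path;
	uint64_t alloc;		/* bytes allocated */
	uint64_t space;		/* bytes of capacity */
};

int
main(void)
{
	/* Index 0 plays the role of the newly added, mostly empty vdev. */
	struct vdev_info v[] = {
		{ "new",  10ULL << 30, 1000ULL << 30 },
		{ "old1", 800ULL << 30, 1000ULL << 30 },
		{ "old2", 600ULL << 30, 1000ULL << 30 },
	};
	int n = (int)(sizeof (v) / sizeof (v[0]));

	int pfull = (int)(v[0].alloc * 100 / v[0].space);
	uint64_t talloc = 0, tspace = 0;

	/* A vdev becomes a copy source if it is fuller than the destination. */
	for (int i = 1; i < n; i++) {
		int cfull = (int)(v[i].alloc * 100 / v[i].space);
		if (cfull > pfull) {
			talloc += v[i].alloc;
			tspace += v[i].space;
			printf("source %s is %d%% full\n", v[i].path, cfull);
		}
	}
	if (tspace == 0)
		return (0);

	/* Combined fullness of the selected sources; the convergence target. */
	int tfull = (int)(100 * talloc / tspace);
	printf("target fullness: %d%%\n", tfull);

	/* Assumed sizing: each source sheds whatever sits above tfull. */
	for (int i = 1; i < n; i++) {
		uint64_t level = v[i].space / 100 * (uint64_t)tfull;
		if (v[i].alloc > level) {
			printf("%s would move ~%llu GiB\n", v[i].path,
			    (unsigned long long)((v[i].alloc - level) >> 30));
		}
	}
	return (0);
}

With these sample numbers the two full vdevs converge on 70% and only the 80%-full one sheds data, which matches the "cfull < tfull" skip in the patch's do/while loop.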