From dda7bf25f733d5d5f47d71f50fa5cb7322904ce1 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 10 Jun 2021 12:56:12 -0700 Subject: [PATCH 01/25] disable zstd mempool --- module/zstd/zfs_zstd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 2c698716c971..4bf412c1a992 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -266,16 +266,16 @@ zstd_mempool_reap(struct zstd_pool *zstd_mempool) static void * zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) { - struct zstd_pool *pool; struct zstd_kmem *mem = NULL; if (!zstd_mempool) { return (NULL); } +#if 0 /* Seek for preallocated memory slot and free obsolete slots */ for (int i = 0; i < ZSTD_POOL_MAX; i++) { - pool = &zstd_mempool[i]; + struct zstd_pool *pool = &zstd_mempool[i]; /* * This lock is simply a marker for a pool object being in use. * If it's already hold, it will be skipped. @@ -309,7 +309,7 @@ zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) * allocations constantly at the end. */ for (int i = 0; i < ZSTD_POOL_MAX; i++) { - pool = &zstd_mempool[i]; + struct zstd_pool *pool = &zstd_mempool[i]; if (mutex_tryenter(&pool->barrier)) { /* Object is free, try to allocate new one */ if (!pool->mem) { @@ -337,6 +337,7 @@ zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) mutex_exit(&pool->barrier); } } +#endif /* * If the pool is full or the allocation failed, try lazy allocation From 2f2ae112584fb357c3fd3b71ed8006a0f01b6996 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 10 Jun 2021 12:55:02 -0700 Subject: [PATCH 02/25] raidz expansion feature This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. 
This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev, by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs can not be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times. After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. 
5-wide RAIDZ2, has 3 data to 2 parity), but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than is expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Contributions-by: Fedor Uporov Contributions-by: Stuart Maybee Contributions-by: Thorsten Behrens Contributions-by: Fmstrat --- cmd/raidz_test/raidz_bench.c | 12 +- cmd/raidz_test/raidz_test.c | 196 +- cmd/raidz_test/raidz_test.h | 3 - cmd/zdb/zdb.c | 5 + cmd/zpool/zpool_main.c | 132 +- cmd/ztest/ztest.c | 755 +++++- contrib/pyzfs/libzfs_core/_constants.py | 2 + .../pyzfs/libzfs_core/_error_translation.py | 3 + contrib/pyzfs/libzfs_core/exceptions.py | 6 + include/libzfs.h | 1 + include/sys/fs/zfs.h | 24 + include/sys/spa_impl.h | 4 + include/sys/uberblock_impl.h | 23 + include/sys/vdev.h | 9 +- include/sys/vdev_impl.h | 6 +- include/sys/vdev_raidz.h | 90 +- include/sys/vdev_raidz_impl.h | 46 +- include/zfeature_common.h | 1 + lib/libzfs/libzfs_pool.c | 5 +- lib/libzfs/libzfs_util.c | 5 + man/man4/zfs.4 | 9 + man/man7/zpool-features.7 | 8 + man/man8/zpool-attach.8 | 46 +- man/man8/zpool-wait.8 | 4 +- module/os/linux/zfs/zfs_debug.c | 3 +- module/zcommon/zfeature_common.c | 5 + module/zfs/dsl_scan.c | 1 - module/zfs/metaslab.c | 5 +- module/zfs/spa.c | 215 +- module/zfs/spa_checkpoint.c | 3 + module/zfs/vdev.c | 105 +- module/zfs/vdev_draid.c | 29 +- module/zfs/vdev_initialize.c | 12 +- module/zfs/vdev_label.c | 51 +- module/zfs/vdev_raidz.c | 2241 ++++++++++++++++- module/zfs/vdev_trim.c | 17 +- tests/runfiles/common.run | 5 +- tests/zfs-tests/include/tunables.cfg | 2 + .../cli_root/zpool_get/zpool_get.cfg | 1 + .../tests/functional/raidz/Makefile.am | 8 +- 
.../tests/functional/raidz/raidz_003_pos.ksh | 2 +- .../tests/functional/raidz/raidz_004_pos.ksh | 2 +- .../functional/raidz/raidz_expand_001_pos.ksh | 226 ++ .../functional/raidz/raidz_expand_002_pos.ksh | 118 + .../functional/raidz/raidz_expand_003_neg.ksh | 94 + .../functional/raidz/raidz_expand_003_pos.ksh | 135 + .../functional/raidz/raidz_expand_004_pos.ksh | 119 + .../functional/raidz/raidz_expand_005_pos.ksh | 170 ++ 48 files changed, 4442 insertions(+), 522 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index f44d6fbde707..0721e6519e7f 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, rto_opts.rto_ashift, ncols+1, ncols, - fn+1, rto_opts.rto_expand_offset); + fn+1, rto_opts.rto_expand_offset, + 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, fn+1); @@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( - zio_bench.io_abd, - zio_bench.io_size, zio_bench.io_offset, + &zio_bench, BENCH_ASHIFT, ncols+1, ncols, - PARITY_PQR, rto_opts.rto_expand_offset); + PARITY_PQR, + rto_opts.rto_expand_offset, 0, B_FALSE); } else { rm_bench = vdev_raidz_map_alloc(&zio_bench, BENCH_ASHIFT, ncols, PARITY_PQR); diff 
--git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index c1610a8d1b0c..f80885916ed0 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -333,14 +333,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = - vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, - opts->zio_golden->io_size, opts->zio_golden->io_offset, + vdev_raidz_map_alloc_expanded(opts->zio_golden, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); - rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, - zio_test->io_size, zio_test->io_offset, + parity, opts->rto_expand_offset, 0, B_FALSE); + rm_test = vdev_raidz_map_alloc_expanded(zio_test, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); @@ -367,187 +365,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } -/* - * If reflow is not in progress, reflow_offset should be UINT64_MAX. - * For each row, if the row is entirely before reflow_offset, it will - * come from the new location. Otherwise this row will come from the - * old location. Therefore, rows that straddle the reflow_offset will - * come from the old location. - * - * NOTE: Until raidz expansion is implemented this function is only - * needed by raidz_test.c to the multi-row raid_map_t functionality. - */ -raidz_map_t * -vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, - uint64_t nparity, uint64_t reflow_offset) -{ - /* The zio's size in units of the vdev's minimum sector size. 
*/ - uint64_t s = size >> ashift; - uint64_t q, r, bc, devidx, asize = 0, tot; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - * AKA "full rows" - */ - q = s / (logical_cols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (logical_cols - nparity); - - /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - - /* How many rows contain data (not skip) */ - uint64_t rows = howmany(tot, logical_cols); - int cols = MIN(tot, logical_cols); - - raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), - KM_SLEEP); - rm->rm_nrows = rows; - - for (uint64_t row = 0; row < rows; row++) { - raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, - rr_col[cols]), KM_SLEEP); - rm->rm_row[row] = rr; - - /* The starting RAIDZ (parent) vdev sector of the row. */ - uint64_t b = (offset >> ashift) + row * logical_cols; - - /* - * If we are in the middle of a reflow, and any part of this - * row has not been copied, then use the old location of - * this row. - */ - int row_phys_cols = physical_cols; - if (b + (logical_cols - nparity) > reflow_offset >> ashift) - row_phys_cols--; - - /* starting child of this row */ - uint64_t child_id = b % row_phys_cols; - /* The starting byte offset on each child vdev. */ - uint64_t child_offset = (b / row_phys_cols) << ashift; - - /* - * We set cols to the entire width of the block, even - * if this row is shorter. This is needed because parity - * generation (for Q and R) needs to know the entire width, - * because it treats the short row as though it was - * full-width (and the "phantom" sectors were zero-filled). 
- * - * Another approach to this would be to set cols shorter - * (to just the number of columns that we might do i/o to) - * and have another mechanism to tell the parity generation - * about the "entire width". Reconstruction (at least - * vdev_raidz_reconstruct_general()) would also need to - * know about the "entire width". - */ - rr->rr_cols = cols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; - rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; - - for (int c = 0; c < rr->rr_cols; c++, child_id++) { - if (child_id >= row_phys_cols) { - child_id -= row_phys_cols; - child_offset += 1ULL << ashift; - } - rr->rr_col[c].rc_devidx = child_id; - rr->rr_col[c].rc_offset = child_offset; - rr->rr_col[c].rc_orig_data = NULL; - rr->rr_col[c].rc_error = 0; - rr->rr_col[c].rc_tried = 0; - rr->rr_col[c].rc_skipped = 0; - rr->rr_col[c].rc_need_orig_restore = B_FALSE; - - uint64_t dc = c - rr->rr_firstdatacol; - if (c < rr->rr_firstdatacol) { - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = - abd_alloc_linear(rr->rr_col[c].rc_size, - B_TRUE); - } else if (row == rows - 1 && bc != 0 && c >= bc) { - /* - * Past the end, this for parity generation. - */ - rr->rr_col[c].rc_size = 0; - rr->rr_col[c].rc_abd = NULL; - } else { - /* - * "data column" (col excluding parity) - * Add an ASCII art diagram here - */ - uint64_t off; - - if (c < bc || r == 0) { - off = dc * rows + row; - } else { - off = r * rows + - (dc - r) * (rows - 1) + row; - } - rr->rr_col[c].rc_size = 1ULL << ashift; - rr->rr_col[c].rc_abd = abd_get_offset_struct( - &rr->rr_col[c].rc_abdstruct, - abd, off << ashift, 1 << ashift); - } - - asize += rr->rr_col[c].rc_size; - } - /* - * If all data stored spans all columns, there's a danger that - * parity will always be on the same device and, since parity - * isn't read during normal operation, that that device's I/O - * bandwidth won't be used effectively. 
We therefore switch - * the parity every 1MB. - * - * ...at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices - * evenly, we won't see any benefit. Further, occasional writes - * that aren't a multiple of the LCM of the number of children - * and the minimum stripe width are sufficient to avoid pessimal - * behavior. Unfortunately, this decision created an implicit - * on-disk format requirement that we need to support for all - * eternity, but only for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for - * padding we must make sure to note this swap. We will never - * intend to skip the first column since at least one data and - * one parity column must appear in each row. - */ - if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && - (offset & (1ULL << 20))) { - ASSERT(rr->rr_cols >= 2); - ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - devidx = rr->rr_col[0].rc_devidx; - uint64_t o = rr->rr_col[0].rc_offset; - rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; - rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; - rr->rr_col[1].rc_devidx = devidx; - rr->rr_col[1].rc_offset = o; - } - - } - ASSERT3U(asize, ==, tot << ashift); - - /* init RAIDZ parity ops */ - rm->rm_ops = vdev_raidz_math_get_ops(); - - return (rm); -} - static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -567,10 +384,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) init_zio_abd(*zio); if (opts->rto_expand) { - rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, - (*zio)->io_size, (*zio)->io_offset, + rm = vdev_raidz_map_alloc_expanded(*zio, opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset); + parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index 
0f7f4cee3eb6..2e7a2bcc59bb 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -117,7 +117,4 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); -struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); - #endif /* RAIDZ_TEST_H */ diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dff61b7bae8b..27ca1db7a45d 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3931,6 +3931,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); + + (void) printf("\traidz_reflow state=%u off=%llu\n", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + (void) printf("%s", footer ? footer : ""); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3a2caa9a8101..4d3944761964 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6526,9 +6526,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); - if (ret == 0 && wait) - ret = zpool_wait(zhp, - replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + if (ret == 0 && wait) { + zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; + char raidz_prefix[] = "raidz"; + if (replacing) { + activity = ZPOOL_WAIT_REPLACE; + } else if (strncmp(old_disk, + raidz_prefix, strlen(raidz_prefix)) == 0) { + activity = ZPOOL_WAIT_RAIDZ_EXPAND; + } + ret = zpool_wait(zhp, activity); + } nvlist_free(props); nvlist_free(nvroot); @@ -7928,6 +7936,98 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) } } +/* + * Print out detailed raidz expansion status. + */ +static void +print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) +{ + char copied_buf[7]; + + if (pres == NULL || pres->pres_state == DSS_NONE) + return; + + /* + * Determine name of vdev. 
+ */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + nvlist_t **child; + uint_t children; + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(pres->pres_expanding_vdev < children); + + printf_color(ANSI_BOLD, gettext("raidz expand: ")); + + time_t start = pres->pres_start_time; + time_t end = pres->pres_end_time; + zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); + + /* + * Expansion is finished or canceled. + */ + if (pres->pres_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + + (void) printf(gettext("Expansion of vdev %u copied %s " + "in %lluh%um, completed on %s"), + (int)pres->pres_expanding_vdev, + copied_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + ctime((time_t *)&end)); + } else { + char examined_buf[7], total_buf[7], rate_buf[7]; + uint64_t copied, total, elapsed, mins_left, hours_left; + double fraction_done; + uint_t rate; + + assert(pres->pres_state == DSS_SCANNING); + + /* + * Expansion is in progress. + */ + (void) printf(gettext( + "Expansion of vdev %u in progress since %s"), + (int)pres->pres_expanding_vdev, ctime(&start)); + + copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1; + total = pres->pres_to_reflow; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - pres->pres_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? 
rate : 1; + mins_left = ((total - copied) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext(" %s copied out of %s at %s/s, " + "%.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (pres->pres_waiting_for_resilver) { + (void) printf(gettext(", paused due to io errors, " + "waiting for resilver or clear\n")); + } else if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } +} static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { @@ -8505,19 +8605,24 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); + pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); + pool_raidz_expand_stat_t *pres = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + print_raidz_expand_status(zhp, pres); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) @@ -10293,8 +10398,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; + 
pool_raidz_expand_stat_t *pres = NULL; char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", - "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -10353,6 +10459,13 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) vdev_activity_top_remaining(nvroot); } + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); + if (pres != NULL && pres->pres_state == DSS_SCANNING) { + int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; + bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -10382,11 +10495,12 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (!wd->wd_enabled[i]) continue; - if (wd->wd_exact) + if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); - else + } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + } if (wd->wd_scripted) (void) printf(i == 0 ? 
"%s" : "\t%s", buf); @@ -10491,7 +10605,7 @@ zpool_do_wait(int argc, char **argv) { static char *col_subopts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", - "scrub", "trim", NULL }; + "scrub", "trim", "raidz_expand", NULL }; /* Reset activities array */ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 5a5c381409a2..646805ab57cb 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -186,6 +186,7 @@ typedef struct ztest_shared_opts { uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; + uint64_t zo_raidz_expand_test; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; @@ -247,6 +248,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, .zo_gvars_count = 0, + .zo_raidz_expand_test = 0, }; extern uint64_t metaslab_force_ganging; @@ -259,6 +261,7 @@ extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; +extern uint64_t raidz_expand_max_offset_pause; static ztest_shared_opts_t *ztest_shared_opts; @@ -309,9 +312,9 @@ typedef struct bufwad { * still need to map from object ID to rangelock_t. */ typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND + ZTRL_READER, + ZTRL_WRITER, + ZTRL_APPEND } rl_type_t; typedef struct rll { @@ -367,11 +370,18 @@ typedef struct ztest_ds { */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); +/* + * XXX: remove zi_raidz_attach_compatible field, when + * raidz expansion will be completely integrated together with + * ztest_raidz_attach_test variable. 
+ */ + typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ const char *zi_funcname; /* name of test function */ + boolean_t zi_raidz_attach_compatible; } ztest_info_t; typedef struct ztest_shared_callstate { @@ -406,6 +416,7 @@ ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_raidz_attach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; @@ -427,56 +438,88 @@ uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ -#define ZTI_INIT(func, iters, interval) \ +#define ZTI_INIT(func, iters, interval, compatible) \ { .zi_func = (func), \ .zi_iters = (iters), \ .zi_interval = (interval), \ + .zi_raidz_attach_compatible = (compatible), \ .zi_funcname = # func } ztest_info_t ztest_info[] = { - ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), - ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), - ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), - ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), - ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), - ZTI_INIT(ztest_zap, 30, &zopt_always), - ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), - ZTI_INIT(ztest_split_pool, 1, &zopt_always), - ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), - ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), - ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), - ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), - ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), - ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always, B_TRUE), + 
ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always, B_FALSE), + ZTI_INIT(ztest_zap, 30, &zopt_always, B_FALSE), + ZTI_INIT(ztest_zap_parallel, 100, &zopt_always, B_FALSE), + ZTI_INIT(ztest_split_pool, 1, &zopt_always, B_FALSE), + ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant, B_FALSE), + ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often, B_FALSE), + ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often, B_FALSE), + ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often, B_FALSE), + ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes, B_FALSE), #if 0 - ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes, B_FALSE), #endif - ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), - ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), - ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), - ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), - ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), - ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), - ZTI_INIT(ztest_reguid, 1, &zopt_rarely), - ZTI_INIT(ztest_scrub, 1, &zopt_rarely), - ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), - ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), - ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), - ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), - ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), - ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), - ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), - ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), - ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), - ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), - ZTI_INIT(ztest_trim, 1, &zopt_sometimes), - ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), - ZTI_INIT(ztest_fletcher_incr, 1, 
&zopt_rarely), - ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), + ZTI_INIT(ztest_fzap, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes, + B_FALSE), + ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_reguid, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_scrub, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime, B_FALSE), + ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime, B_FALSE), + ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime, + B_FALSE), + ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_initialize, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_trim, 1, &zopt_sometimes, B_FALSE), + ZTI_INIT(ztest_fletcher, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely, B_FALSE), + ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes, B_FALSE), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) +ztest_info_t raidz_expand_info[] = { +/* XXX - does this list of activities need further pruning? 
*/ + ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always, B_TRUE), + ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_zap, 30, &zopt_always, B_TRUE), + ZTI_INIT(ztest_zap_parallel, 100, &zopt_always, B_TRUE), + ZTI_INIT(ztest_split_pool, 1, &zopt_always, B_TRUE), + ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant, B_TRUE), + ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often, B_TRUE), + ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often, B_TRUE), + ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often, B_TRUE), + ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes, B_TRUE), +#if 0 + ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes, B_TRUE), +#endif + ZTI_INIT(ztest_fzap, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely, B_TRUE), + ZTI_INIT(ztest_initialize, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_trim, 1, &zopt_sometimes, B_TRUE), + ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes, B_TRUE), +}; + +#define RAIDZ_EXPAND_FUNCS (sizeof (raidz_expand_info) / sizeof (ztest_info_t)) + /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. 
@@ -491,6 +534,7 @@ typedef struct ztest_cb_list { */ typedef struct ztest_shared { boolean_t zs_do_init; + boolean_t zs_do_raidz_scratch_verify; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; @@ -503,6 +547,7 @@ typedef struct ztest_shared { uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; + uint64_t zs_raidzs_attached; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; @@ -519,6 +564,7 @@ static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; +static boolean_t ztest_raidz_attach_test = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; @@ -774,6 +820,9 @@ static ztest_option_t option_table[] = { NO_DEFAULT, NULL}, { 'C', "vdev-class-state", "on|off|random", "vdev class state", NO_DEFAULT, "random"}, + { 'X', "raidz-expand-max-offset", "OFFSET", + "raidz_expand test, killing at off bytes into reflow", + NO_DEFAULT, NULL}, { 'o', "option", "\"OPTION=INTEGER\"", "Set global variable to an unsigned 32-bit integer value", NO_DEFAULT, NULL}, @@ -953,6 +1002,7 @@ process_options(int argc, char **argv) case 'T': case 'P': case 'F': + case 'X': value = nicenumtoull(optarg); } switch (opt) { @@ -1021,6 +1071,9 @@ process_options(int argc, char **argv) case 'V': zo->zo_verbose++; break; + case 'X': + zo->zo_raidz_expand_test = value; + break; case 'E': zo->zo_init = 0; break; @@ -1628,7 +1681,7 @@ ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); - if (type == RL_READER) { + if (type == ZTRL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; @@ -2084,7 +2137,7 @@ ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); - ztest_object_lock(zd, object, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_WRITER); 
VERIFY0(dmu_object_info(os, object, &doi)); @@ -2154,8 +2207,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (bt->bt_magic != BT_MAGIC) bt = NULL; - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2257,9 +2310,9 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_READER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); + ZTRL_WRITER); tx = dmu_tx_create(os); @@ -2299,7 +2352,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -2423,7 +2476,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); - ztest_object_lock(zd, object, RL_READER); + ztest_object_lock(zd, object, ZTRL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); @@ -2448,7 +2501,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, ZTRL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2463,7 +2516,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); + object, offset, size, 
ZTRL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); @@ -2540,7 +2593,7 @@ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ - ztest_object_lock(zd, od->od_object, RL_READER); + ztest_object_lock(zd, od->od_object, ZTRL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); @@ -2713,8 +2766,8 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) txg_wait_synced(dmu_objset_pool(os), 0); - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_object_lock(zd, object, ZTRL_READER); + rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); tx = dmu_tx_create(os); @@ -3048,6 +3101,8 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; + uint64_t raidz_children = ztest_opts.zo_raid_children + + ztest_shared->zs_raidzs_attached; uint64_t version, newversion; nvlist_t *nvroot, *props; char *name; @@ -3068,7 +3123,7 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); + NULL, raidz_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -3134,6 +3189,7 @@ ztest_spa_checkpoint(spa_t *spa) case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: break; case ENOSPC: ztest_record_enospc(FTAG); @@ -3214,6 +3270,9 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; + uint64_t raidz_children = ztest_opts.zo_raid_children + + ztest_shared->zs_raidzs_attached; + nvlist_t *nvroot; int error; @@ -3221,8 +3280,7 @@ 
ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3276,7 +3334,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + "log" : NULL, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); @@ -3304,6 +3362,8 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; + uint64_t raidz_children = ztest_opts.zo_raid_children + + ztest_shared->zs_raidzs_attached; const char *class = (ztest_random(2) == 0) ? VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; @@ -3331,15 +3391,14 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * - ztest_opts.zo_raid_children; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); + class, raidz_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); @@ -3599,6 +3658,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; + uint64_t raidz_children = ztest_opts.zo_raid_children + + ztest_shared->zs_raidzs_attached; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; @@ -3614,7 +3675,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = 
umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3653,7 +3714,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf / raidz_children]; } /* pick a child out of the raidz group */ @@ -3662,8 +3723,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); - ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; + oldvd = oldvd->vdev_child[leaf % raidz_children]; } /* @@ -3827,6 +3887,140 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) umem_free(newpath, MAXPATHLEN); } +#define RAIDZ_REFLOW_OFFSET_PAUSE 4 + +static void +raidz_scratch_verify(void) +{ + spa_t *spa; + + if (ztest_shared->zs_do_raidz_scratch_verify == B_FALSE) + return; + + kernel_init(SPA_MODE_READ); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); + ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), >=, + RAIDZ_REFLOW_OFFSET_PAUSE); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + ztest_shared->zs_do_raidz_scratch_verify = B_FALSE; + + spa_close(spa, FTAG); + kernel_fini(); +} + +static boolean_t +ztest_vdev_raidz_attach_possible(spa_t *spa) +{ + ztest_shared_t *zs = ztest_shared; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd = rvd->vdev_child[0]; + + if (rvd->vdev_children == 1 && + strcmp(vd->vdev_ops->vdev_op_type, "raidz") == 0 && + zs->zs_mirrors == 0) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * 
Verify that we can attach raidz device. + */ +/* ARGSUSED */ +void +ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + uint64_t csize, ashift = ztest_get_ashift(); + vdev_t *cvd, *pvd; + nvlist_t *root; + char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + int error, expected_error = 0; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (!ztest_vdev_raidz_attach_possible(spa)) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + pvd = vdev_lookup_top(spa, 0); + + ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + + /* + * Get size of a child of the raidz group, + * make sure device is a bit bigger + */ + cvd = pvd->vdev_child[0]; + csize = vdev_get_min_asize(cvd); + csize += csize / 10; + + if (spa->spa_raidz_expand) + expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, pvd->vdev_children); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, csize, ashift, NULL, + 0, 0, 1); + + /* + * XXX this doesn't work right because spa_vdev_attach() won't + * return until it can write the first txg of the reflow, which + * will be paused. We need to kill off from another thread?? 
+ */ +#if 0 + if (ztest_random(2) == 0 && expected_error == 0) { + raidz_expand_max_offset_pause = RAIDZ_REFLOW_OFFSET_PAUSE; + ztest_shared->zs_do_raidz_scratch_verify = B_TRUE; + } +#endif + + error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); + + nvlist_free(root); + + if (error == 0) { + ztest_shared->zs_raidzs_attached++; + } else if (error != 0 && error != expected_error) { + fatal(0, "raidz attach (%s %llu) returned %d, expected %d", + newpath, (long long)csize, error, expected_error); + } else if (error == 0 && ztest_shared->zs_do_raidz_scratch_verify) { + /* + * Wait raidz expansion thread starting and kill it. + */ + sleep(10); + ztest_kill(ztest_shared); + } + +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(newpath, MAXPATHLEN); +} + /* ARGSUSED */ void ztest_device_removal(ztest_ds_t *zd, uint64_t id) @@ -6017,6 +6211,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; + uint64_t raidz_children = ztest_opts.zo_raid_children + + ztest_shared->zs_raidzs_attached; char *path0; char *pathrand; size_t fsize; @@ -6045,7 +6241,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; + leaves = MAX(zs->zs_mirrors, 1) * raidz_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); @@ -7066,8 +7262,12 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) hrtime_t functime = gethrtime(); int i; - for (i = 0; i < zi->zi_iters; i++) - zi->zi_func(zd, id); + for (i = 0; i < zi->zi_iters; i++) { + if (!ztest_raidz_attach_test) + zi->zi_func(zd, id); + else if (zi->zi_raidz_attach_compatible) + zi->zi_func(zd, id); + } functime = gethrtime() - functime; @@ -7079,6 +7279,38 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) (double)functime / NANOSEC, zi->zi_funcname); } +static void +ztest_rzx_thread(void *arg) +{ + int rand; + uint64_t id = (uintptr_t)arg; + 
ztest_shared_t *zs = ztest_shared; + uint64_t call_next; + hrtime_t now; + ztest_info_t *zi; + ztest_shared_callstate_t *zc; + + while ((now = gethrtime()) < zs->zs_thread_stop) { + /* + * Pick a random function to execute. + * XXX - better to pick a specific set of functions here? + * i.e. a deterministic set of operations to generate pool data. + */ + rand = ztest_random(RAIDZ_EXPAND_FUNCS); + zi = &raidz_expand_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; + + if (now >= call_next && + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } + } + + thread_exit(); +} + static void ztest_thread(void *arg) { @@ -7285,9 +7517,13 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; + if (ztest_raidz_attach_test) + return; + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); @@ -7355,6 +7591,7 @@ ztest_freeze(void) /* * Open and close the pool and dataset to induce log replay. */ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); @@ -7400,6 +7637,7 @@ ztest_import(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(zs); @@ -7422,6 +7660,363 @@ ztest_import(ztest_shared_t *zs) mutex_destroy(&ztest_checkpoint_lock); } +#define RAIDZ_EXPAND_KILLED UINT64_MAX +#define RAIDZ_EXPAND_CHECKED (UINT64_MAX - 1) + +/* + * Start a raidz expansion test. We run some I/O on the pool for a while + * to get some data in the pool. 
Then we grow the raidz and + * kill the test at the requested offset into the reflow, verifying that + * doing such does not lead to pool corruption. + */ +static void +ztest_raidz_expand_run(ztest_shared_t *zs) +{ + spa_t *spa; + objset_t *os; + kthread_t *resume_thread, *deadman_thread; + kthread_t **run_threads; + uint64_t object; + uint64_t ashift = ztest_get_ashift(); + int error; + int i, t, d; + vdev_t *rzvd, *cvd; + uint64_t csize, desreflow; + nvlist_t *root; + char *newpath; + pool_raidz_expand_stat_t rzx_stats; + pool_raidz_expand_stat_t *pres = &rzx_stats; + + newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. + */ + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } + + mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. 
+ */ + raidz_scratch_verify(); + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(zs); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + VERIFY0(vdev_raidz_impl_set("cycle")); + + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Create a thread to periodically resume suspended I/O. + */ + resume_thread = thread_create(NULL, 0, ztest_resume_thread, + spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + /* + * Create a deadman thread and set to panic if we hang. + */ + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Verify that we can safely inquire about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. 
+ */ + for (t = 0; t < 64; t++) { + for (d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * We should not get any ENOSPC errors in this test + */ + if (zs->zs_enospc_count != 0) { + fatal(0, "raidz expand: ENOSPC errors?"); + } + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Replay all logs of all datasets in the pool. This is primarily for + * temporary datasets which wouldn't otherwise get replayed, which + * can trigger failures when attempting to offline a SLOG in + * ztest_fault_inject(). + */ + (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, + NULL, DS_FIND_CHILDREN); + + if (ztest_opts.zo_raidz_expand_test != 0 && + ztest_opts.zo_raidz_expand_test < RAIDZ_EXPAND_KILLED) { + desreflow = ztest_opts.zo_raidz_expand_test; + /* + * Set the reflow to pause at the desired offset + */ + raidz_expand_max_offset_pause = desreflow; + /* + * In here on first pass of test only. + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("running raidz expansion test," + " killing when offset %llu of reflow reached\n", + (u_longlong_t)desreflow); + if (ztest_opts.zo_verbose > 1) { + /* XXX - pause to allow debugger attach */ + (void) printf( + "our pid is %d, pausing for 10 seconds\n", + getpid()); + sleep(10); + } + } + /* + * Put some data in the pool and then attach a vdev to initiate + * reflow. + */ + /* + * Kick off all the I/O generators that run in parallel. 
+ */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && + ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, + ztest_rzx_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait a while for I/O to put some data in the pool + * XXX- add an option to specify if we wait for I/O to quiesce + */ + for (i = 0; i < 60; i++) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 1000); + } + + rzvd = spa->spa_root_vdev->vdev_child[0]; + ASSERT(rzvd->vdev_ops == &vdev_raidz_ops); + /* + * get size of a child of the raidz group + */ + cvd = rzvd->vdev_child[0]; + + csize = vdev_get_min_asize(cvd); + csize += csize / 10; /* make sure device is a bit bigger */ + /* + * Path to vdev to be attached + */ + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); + /* + * Build the nvlist describing newpath. 
+ */ + root = make_vdev_root(newpath, NULL, NULL, csize, ashift, NULL, + 0, 0, 1); + /* + * Now attach the vdev to the raidz so it will expand + */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("expanding raidz\n"); + } + error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, + B_FALSE); + nvlist_free(root); + if (error != 0) { + fatal(0, "raidz expand: attach (%s %llu) returned %d", + newpath, (long long)csize, error); + } + + /* + * Wait for desired reflow offset to be reached and kill the + * test + */ + /* + * Wait for reflow to begin + */ + while (spa->spa_raidz_expand == NULL) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + } + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + while (pres->pres_state != DSS_SCANNING) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + ASSERT3U(pres->pres_state, ==, DSS_SCANNING); + ASSERT3U(pres->pres_to_reflow, !=, 0); + /* + * Set so when we are killed we go to raidz checking rather than + * restarting test. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion reflow started," + " waiting for offset %llu to be reached\n", + (u_longlong_t)desreflow); + } + + while (pres->pres_reflowed < desreflow) { + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); /* wait 1/10 second */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + (void) spa_raidz_expand_get_stats(spa, pres); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* + * XXX - should we clear the reflow pause here? 
+ */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf( + "killing raidz expansion test offset at %llu\n", + (u_longlong_t)pres->pres_reflowed); + } + /* + * Kill ourself, this simulates a panic during a reflow. Our + * parent will restart the test and the changed flag value + * will drive the test through the scrub/check code to + * verify the pool is not corrupted. + */ + ztest_kill(zs); + } else { /* check the pool is healthy */ + /* + * Set pool check done flag, main program will run a zdb check + * of the pool when we exit. + */ + ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; + /* XXX - wait for reflow done? */ + if (ztest_opts.zo_verbose >= 1) { + (void) printf("\nverifying raidz expansion\n"); + if (ztest_opts.zo_verbose > 1) { + /* XXX - pause to allow debugger attach */ + (void) printf( + "our pid is %d, pausing for 10 seconds\n", + getpid()); + sleep(10); + } + } + VERIFY0(ztest_scrub_impl(spa)); + if (ztest_opts.zo_verbose >= 1) { + (void) printf("raidz expansion scrub check complete\n"); + } + } + + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + + /* Kill the resume and deadman threads */ + ztest_exiting = B_TRUE; + VERIFY0(thread_join(resume_thread)); + VERIFY0(thread_join(deadman_thread)); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } + + /* Verify that at least one commit cb was called in a timely fashion */ + if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) + VERIFY0(zc_min_txg_delay); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. 
+ */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (ztest_opts.zo_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(name, sizeof (name), "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); + } + + kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + mutex_destroy(&zcl.zcl_callbacks_lock); + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + /* * Kick off threads to run tests on all datasets in parallel. */ @@ -7464,6 +8059,7 @@ ztest_run(ztest_shared_t *zs) * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. 
*/ + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { @@ -7477,7 +8073,12 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; - VERIFY0(vdev_raidz_impl_set("cycle")); + ztest_raidz_attach_test = ztest_vdev_raidz_attach_possible(spa); + + /* + * BUGBUG raidz expansion do not run this for now + * VERIFY0(vdev_raidz_impl_set("cycle")); + */ dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, @@ -7707,6 +8308,7 @@ ztest_init(ztest_shared_t *zs) mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + raidz_scratch_verify(); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* @@ -8054,10 +8656,14 @@ main(int argc, char **argv) metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; - if (zs->zs_do_init) + if (zs->zs_do_init) { ztest_run_init(); - else - ztest_run(zs); + } else { + if (ztest_opts.zo_raidz_expand_test) + ztest_raidz_expand_run(zs); + else + ztest_run(zs); + } exit(0); } @@ -8183,6 +8789,9 @@ main(int argc, char **argv) if (!ztest_opts.zo_mmp_test) ztest_run_zdb(ztest_opts.zo_pool); + if (ztest_shared_opts->zo_raidz_expand_test == + RAIDZ_EXPAND_CHECKED) + break; /* raidz expand test complete */ } if (ztest_opts.zo_verbose >= 1) { @@ -8196,6 +8805,8 @@ main(int argc, char **argv) kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } + dump_debug_buffer(); + umem_free(cmd, MAXNAMELEN); return (0); diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 2dfed224c29d..0e1fd7e99374 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -99,6 +99,7 @@ def enum(*sequential, **named): 'ZFS_ERR_RESILVER_IN_PROGRESS', 'ZFS_ERR_REBUILD_IN_PROGRESS', 'ZFS_ERR_BADPROP', + 'ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS', ], {} ) @@ -110,5 +111,6 @@ def enum(*sequential, 
**named): ZFS_ERR_DEVRM_IN_PROGRESS = zfs_errno.ZFS_ERR_DEVRM_IN_PROGRESS ZFS_ERR_VDEV_TOO_BIG = zfs_errno.ZFS_ERR_VDEV_TOO_BIG ZFS_ERR_WRONG_PARENT = zfs_errno.ZFS_ERR_WRONG_PARENT +ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS = zfs_errno.ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index f494461f63b2..c562f2ffb733 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -43,6 +43,7 @@ ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -594,6 +595,8 @@ def lzc_pool_checkpoint_translate_error(ret, name, discard=False): raise lzc_exc.DeviceRemovalRunning() if ret == ZFS_ERR_VDEV_TOO_BIG: raise lzc_exc.DeviceTooBig() + if ret == ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + raise lzc_exc.RaidzExpansionRunning() if discard: raise _generic_exception( ret, name, "Failed to discard pool checkpoint") diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index e484b07b6450..ba8f7e49093c 100644 --- a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -30,6 +30,7 @@ ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, zfs_errno ) @@ -598,4 +599,9 @@ class DeviceTooBig(ZFSError): message = "One or more top-level vdevs exceed the maximum vdev size" +class RaidzExpansionRunning(ZFSError): + errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS + message = "A raidz device is currently expanding" + + # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/include/libzfs.h b/include/libzfs.h index c0883a983678..e9309f2c9b74 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -150,6 +150,7 @@ typedef enum zfs_error { EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ 
EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ + EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ EZFS_UNKNOWN } zfs_error_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2af11fc7196d..3845e2bb9a6f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -630,6 +630,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ @@ -695,6 +696,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" @@ -810,6 +813,15 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ + "org.freebsd:raidz_expand_state" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ + "org.freebsd:raidz_expand_start_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ + "org.freebsd:raidz_expand_end_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ + "org.freebsd:raidz_expand_bytes_copied" + /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" @@ -1026,6 +1038,16 @@ typedef struct pool_removal_stat { uint64_t prs_mapping_memory; } pool_removal_stat_t; +typedef 
struct pool_raidz_expand_stat { + uint64_t pres_state; /* dsl_scan_state_t */ + uint64_t pres_expanding_vdev; + uint64_t pres_start_time; + uint64_t pres_end_time; + uint64_t pres_to_reflow; /* bytes that need to be moved */ + uint64_t pres_reflowed; /* bytes moved so far */ + uint64_t pres_waiting_for_resilver; +} pool_raidz_expand_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, @@ -1417,6 +1439,7 @@ typedef enum { ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, } zfs_errno_t; /* @@ -1441,6 +1464,7 @@ typedef enum { ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9714bbce9c9d..fd54fcfaeb1b 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -315,6 +316,9 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ + vdev_raidz_expand_t *spa_raidz_expand; + zthr_t *spa_raidz_expand_zthr; + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 91699e65131a..28ebca3c2ba1 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -75,6 +75,27 @@ extern "C" { #define MMP_FAIL_INT_SET(fail) \ (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) +typedef enum raidz_reflow_scratch_state { + RRSS_SCRATCH_NOT_IN_USE = 0, + RRSS_SCRATCH_VALID, +} raidz_reflow_scratch_state_t; + +#define RRSS_GET_OFFSET(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 32, SPA_MINBLOCKSHIFT, 0) +#define RRSS_SET_OFFSET(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 32, SPA_MINBLOCKSHIFT, 0, x) + +#define RRSS_GET_STATE(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 32, 8, 0, 0) +#define RRSS_SET_STATE(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 32, 8, 0, 0, x) + +#define RAIDZ_REFLOW_SET(ub, state, offset) do { \ + (ub)->ub_raidz_reflow_info = 0; \ + RRSS_SET_OFFSET(ub, offset); \ + RRSS_SET_STATE(ub, state); \ +} while (0) + struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ @@ -136,6 +157,8 @@ struct uberblock { * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. 
*/ uint64_t ub_checkpoint_txg; + + uint64_t ub_raidz_reflow_info; }; #ifdef __cplusplus diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 0a81713a44d0..4eb25c7fb3c5 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -132,15 +132,19 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, + uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* - * Return the amount of space allocated for a gang block header. + * Return the amount of space allocated for a gang block header. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); + return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); @@ -207,6 +211,7 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); +extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..100f42a61dbb 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -73,7 +73,7 @@ typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); 
typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); @@ -296,6 +296,7 @@ struct vdev { uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ + boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ @@ -541,6 +542,7 @@ typedef struct vdev_label { /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. + * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ @@ -613,7 +615,7 @@ extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index ee597eb0dbb3..f784d425e9b3 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -26,6 +26,7 @@ #define _SYS_VDEV_RAIDZ_H #include +#include #ifdef __cplusplus extern "C" { @@ -34,6 +35,8 @@ extern "C" { struct zio; struct raidz_row; struct raidz_map; +struct vdev_raidz; +struct uberblock; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -43,12 +46,18 @@ struct kernel_param {}; */ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); +struct raidz_map *vdev_raidz_map_alloc_expanded(struct zio *, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, 
uint64_t, boolean_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); +struct raidz_row *vdev_raidz_row_alloc(int); +void vdev_raidz_reflow_copy_scratch(spa_t *); +void raidz_dtl_reassessed(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; @@ -63,11 +72,90 @@ int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz_expand { + uint64_t vre_vdev_id; + + kmutex_t vre_lock; + kcondvar_t vre_cv; + + /* + * How much i/o is outstanding (issued and not completed). + */ + uint64_t vre_outstanding_bytes; + + /* + * Next offset to issue i/o for. + */ + uint64_t vre_offset; + + /* + * Lowest offset of a failed expansion i/o. The expansion will retry + * from here. Once the expansion thread notices the failure and exits, + * vre_failed_offset is reset back to UINT64_MAX, and + * vre_waiting_for_resilver will be set. + */ + uint64_t vre_failed_offset; + boolean_t vre_waiting_for_resilver; + + /* + * Offset that is completing each txg + */ + uint64_t vre_offset_pertxg[TXG_SIZE]; + + /* + * Bytes copied in each txg. + */ + uint64_t vre_bytes_copied_pertxg[TXG_SIZE]; + + /* + * The rangelock prevents normal read/write zio's from happening while + * there are expansion (reflow) i/os in progress to the same offsets. 
+ */ + zfs_rangelock_t vre_rangelock; + + /* + * These fields are stored on-disk in the vdev_top_zap: + */ + dsl_scan_state_t vre_state; + uint64_t vre_start_time; + uint64_t vre_end_time; + uint64_t vre_bytes_copied; +} vdev_raidz_expand_t; + typedef struct vdev_raidz { - int vd_logical_width; + /* + * Number of child vdevs when this raidz vdev was created (i.e. before + * any raidz expansions). + */ + int vd_original_width; + + /* + * The current number of child vdevs, which may be more than the + * original width if an expansion is in progress or has completed. + */ + int vd_physical_width; + int vd_nparity; + + /* + * Tree of reflow_node_t's. The lock protects the avl tree only. + * The reflow_node_t's describe completed expansions, and are used + * to determine the logical width given a block's birth time. + */ + avl_tree_t vd_expand_txgs; + kmutex_t vd_expand_lock; + + /* + * If this vdev is being expanded, spa_raidz_expand is set to this + */ + vdev_raidz_expand_t vn_vre; } vdev_raidz_t; +extern int vdev_raidz_attach_check(vdev_t *); +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void spa_start_raidz_expansion_thread(spa_t *); +extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); +extern int vdev_raidz_load(vdev_t *); #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 908723da0c2a..74714538cafe 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -30,6 +30,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -102,28 +104,31 @@ typedef struct raidz_impl_ops { char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ } raidz_impl_ops_t; + typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ + int rc_devidx; /* child device index for I/O */ + uint32_t rc_size; /* I/O size */ uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ abd_t 
rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ - uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ - uint8_t rc_force_repair; /* Write good data to this column */ - uint8_t rc_allow_repair; /* Allow repair I/O to this column */ + uint8_t rc_tried:1; /* Did we attempt this I/O column? */ + uint8_t rc_skipped:1; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ + uint8_t rc_force_repair:1; /* Write good data to this column */ + uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + int rc_shadow_devidx; /* for double write during expansion */ + int rc_shadow_error; /* for double write during expansion */ + uint64_t rc_shadow_offset; /* for double write during expansion */ } raidz_col_t; typedef struct raidz_row { - uint64_t rr_cols; /* Regular column count */ - uint64_t rr_scols; /* Count including skipped columns */ - uint64_t rr_bigcols; /* Remainder data column count */ - uint64_t rr_missingdata; /* Count of missing data devices */ - uint64_t rr_missingparity; /* Count of missing parity devices */ - uint64_t rr_firstdatacol; /* First data column/parity count */ + int rr_cols; /* Regular column count */ + int rr_scols; /* Count including skipped columns */ + int rr_missingdata; /* Count of missing data devices */ + int rr_missingparity; /* Count of missing parity devices */ + int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ #ifdef ZFS_DEBUG @@ -138,10 +143,25 @@ typedef struct raidz_map { int rm_nrows; /* Regular row count */ int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ 
+ int rm_original_width; /* pre-expansion width of raidz vdev */ + int rm_nphys_cols; /* num entries in rm_phys_col[] */ + zfs_locked_range_t *rm_lr; const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t *rm_phys_col; /* if non-NULL, read i/o aggregation */ raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; +/* + * Nodes in vdev_raidz_t:vd_expand_txgs. + * Blocks with physical birth time of re_txg or later have the specified + * logical width (until the next node). + */ +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 874cbd9ff714..db9d5231bbba 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -75,6 +75,7 @@ typedef enum spa_feature { SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURE_ZSTD_COMPRESS, SPA_FEATURE_DRAID, + SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 8ed96275c4db..d95c86941e50 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3392,9 +3392,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, break; case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or device removal is in progress"), - new_disk); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index c3c009ae3a10..28f981e01916 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -296,6 +296,8 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_REBUILDING: return (dgettext(TEXT_DOMAIN, "currently sequentially " "resilvering")); + case EZFS_RAIDZ_EXPAND_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, 
"unknown error")); default: @@ -732,6 +734,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; + case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: + zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 9c21688705b8..ca7ae67402fe 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -387,6 +387,15 @@ Note, that both this many milliseconds and .Sy metaslab_unload_delay TXGs must pass before unloading will occur. . +.It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong +Max amount of concurrent i/o for RAIDZ expansion. +. +.It Sy raidz_expand_max_offset_pause Ns = Ns Sy 0 Pq ulong +For testing, pause RAIDZ expansion at this offset. +. +.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong +For expanded RAIDZ, aggregate reads that have more rows than this. +. .It Sy reference_history Ns = Ns Sy 3 Pq int Maximum reference holders being tracked when reference_tracking_enable is active. . diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 83ca91175370..826f93b7a1f4 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -836,6 +836,14 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. +. +.feature org.openzfs raidz_expansion no none +This feature enables the +.Nm zpool Cm attach +subcommand to attach a new device to a RAIDZ group, expanding the total +amount of usable space in the pool. +See +.Xr zpool-attach 8 . .El . .Sh SEE ALSO diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index 19d8f6ac07ac..c9adb8dada8a 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -45,7 +45,14 @@ Attaches .Ar new_device to the existing .Ar device . -The existing device cannot be part of a raidz configuration.
+The behavior differs depending on if the existing +.Ar device +is a RAIDZ device, or a mirror/plain device. +.Pp +If the existing device is a mirror or plain device (e.g. specified as "sda" or +"mirror-7"), the new device will be mirrored with the existing device, a +resilver will be initiated, and the new device will contribute to additional +redundancy once the resilver completes. If .Ar device is not currently part of a mirrored configuration, @@ -62,6 +69,38 @@ creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately and any running scrub is cancelled. +.Pp +If the existing device is a RAIDZ device (e.g. specified as "raidz2-0"), the new +device will become part of that RAIDZ group. +A "raidz expansion" will be initiated, and the new device will contribute +additional space to the RAIDZ group once the expansion completes. +The expansion entails reading all allocated space from existing disks in the +RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the +newly added +.Ar device ) . +Its progress can be monitored with +.Nm zpool Cm status . +.Pp +Data redundancy is maintained during and after the expansion. +If a disk fails while the expansion is in progress, the expansion pauses until +the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk +and waiting for reconstruction to complete). +Expansion does not change the number of failures that can be tolerated +without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). +A RAIDZ vdev can be expanded multiple times. +.Pp +After the expansion completes, old blocks remain with their old data-to-parity +ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but distributed among the +larger set of disks. +New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide +RAIDZ2 which has been expanded once to 6-wide, has 4 data to 2 parity). 
+However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly +less space than is expected may be reported for newly-written blocks, according +to +.Nm zfs Cm list , +.Nm df , +.Nm ls Fl s , +and similar tools. .Bl -tag -width Ds .It Fl f Forces use of @@ -76,16 +115,15 @@ manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .It Fl s -The +When attaching to a mirror or plain device, the .Ar new_device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verfied during sequential reconstruction so a scrub is started when the resilver completes. -Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until .Ar new_device -has finished resilvering before returning. +has finished resilvering or expanding before returning. .El . .Sh SEE ALSO diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 38f4812ace10..40603b671873 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -20,7 +20,7 @@ .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. 
@@ -76,6 +76,8 @@ Resilver to cease Scrub to cease .It Sy trim Manual trim to cease +.It Sy raidz_expand +Attaching to a RAIDZ vdev to complete .El .Pp If an diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index 98c9923d5927..b68c93436543 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -174,7 +174,8 @@ __dprintf(boolean_t dprint, const char *file, const char *func, newfile = file; } - i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + i = snprintf(buf, size, "%px %s%s:%d:%s(): ", + curthread, prefix, newfile, line, func); if (i < size) { va_start(adx, fmt); diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index fc0e09605eef..4bc3b9c97017 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -598,6 +598,11 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DRAID, "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION, + "org.openzfs:raidz_expansion", "raidz_expansion", + "Support for raidz expansion", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d25c067dfbc1..50d4ed3d40b5 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2708,7 +2708,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d1fee70f004b..ce68134b00bb 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -4272,7 +4272,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 
metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -5210,7 +5211,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7546e3e414f1..dd9a7f626af4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -1579,6 +1580,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1728,6 +1733,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -2835,6 +2842,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -3581,6 +3589,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ -4881,6 +4895,15 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); + /* + * Before we do any zio_write's, complete the raidz 
expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) != + RRSS_SCRATCH_NOT_IN_USE) { + vdev_raidz_reflow_copy_scratch(spa); + } + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. @@ -6676,8 +6699,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; + boolean_t raidz = B_FALSE; ASSERT(spa_writeable(spa)); @@ -6705,16 +6729,31 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + if (oldvd->vdev_ops == &vdev_raidz_ops) { + raidz = B_TRUE; + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) @@ -6768,6 +6807,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -6807,7 +6847,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. 
*/ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -6817,32 +6858,57 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * RAIDZ-expansion-specific checks. + */ + if (raidz && vdev_raidz_attach_check(newvd) != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + vdev_get_nparity(oldvd), oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. 
*/ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -6870,41 +6936,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. 
- */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. + */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -7229,7 +7320,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7344,7 +7435,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. 
*/ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -8241,6 +8333,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8267,6 +8363,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -9057,6 +9157,28 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, we don't want this + * txg to be able to be a no-op, so be sure to sync + * the config to the MOS before checking for no-op + * txg below. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. 
+ */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -9699,7 +9821,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; fallthrough; case ZPOOL_WAIT_SCRUB: @@ -9714,6 +9837,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 09f62996853d..2d09cfe90c00 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4a67ba85f58a..7d3290fc807d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -294,13 +295,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. 
*/ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -869,6 +870,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1604,6 +1607,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1825,17 +1830,19 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * (expanded RAIDZ vdevs can use different asize for different birth txg's). 
*/ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3103,32 +3110,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } int @@ -3469,6 +3487,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -3793,10 +3817,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). 
+ */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -3932,9 +3968,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5191,7 +5224,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index b8f82d52e8f0..c583718ac634 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -577,7 +577,7 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -913,7 +913,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -976,15 +976,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, /* The total number of data and parity sectors for this I/O. */ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1))); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_scols = groupwidth; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1004,14 +998,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1080,7 +1066,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1102,7 +1088,6 @@ 
vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - return (rm); } @@ -1728,7 +1713,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1785,7 +1770,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; @@ -1983,6 +1968,8 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) } } +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + /* * Start an IO operation to a dRAID vdev. */ diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index e9156c32f384..368c75cb17cb 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -52,7 +52,8 @@ static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -71,7 +72,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -597,6 +599,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); 
ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -738,13 +741,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index daf53f0a0c8b..d30e8cdcf177 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -1488,7 +1496,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1505,6 +1514,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, 
&cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1562,10 +1574,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(config); bzero(ub, sizeof (uberblock_t)); + bzero(&cb, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1582,6 +1594,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + bzero(ub, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1703,8 +1731,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. 
we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - RRSS_GET_STATE(ub)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1721,7 +1764,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 7e7202ec1e55..9631ac8a9ccc 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -27,15 +27,21 @@ #include #include +#include +#include #include +#include #include #include +#include #include +#include #include #include #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +141,22 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } +/* + * For testing only: logical offset at which to pause the expansion. + */ +unsigned long raidz_expand_max_offset_pause = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. 
+ */ +unsigned long raidz_io_aggregate_rows = 4; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +181,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +203,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -343,18 +403,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; - rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; @@ -372,18 +424,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } 
rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; - if (c >= acols) - rc->rc_size = 0; - else if (c < bc) + if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; @@ -425,7 +467,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +476,374 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. 
+ */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, asize, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + asize = 0; + +#if 1 + zfs_dbgmsg("rm=%px s=%d q=%d r=%d bc=%d nrows=%d cols=%d rfo=%llx", + rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols, + (long long)reflow_offset_synced); +#endif + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. 
*/ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". 
+ */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. We + * should only be doing reads if this is the case. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. + */ + if (use_scratch && + (b + c) << ashift < reflow_offset_synced) { + rc->rc_offset -= VDEV_BOOT_SIZE; + } + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's and are set + * below after determining if this is an + * aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. 
+ */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* XXX ASCII art diagram here */ + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } +#if 1 + zfs_dbgmsg("rm=%px row=%d c=%d dc=%d off=%u " + "devidx=%u offset=%llu rpc=%u", + rm, (int)row, (int)c, (int)dc, (int)off, + (int)child_id, (long long)child_offset, + (int)row_phys_cols); +#endif + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (rc->rc_size != 0 && + row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift) { + ASSERT3U(row_phys_cols, ==, physical_cols - 1); + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + zfs_dbgmsg("rm=%px row=%d b+c=%llu " + "shadow_devidx=%u shadow_offset=%llu", + rm, (int)row, (long long)(b + c), + (int)rc->rc_shadow_devidx, + (long long)rc->rc_shadow_offset); + } + + asize += rc->rc_size; + } + + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. We therefore switch the + * parity every 1MB. + * + * ... 
at least that was, ostensibly, the theory. As a + * practical matter unless we juggle the parity between all + * devices evenly, we won't see any benefit. Further, + * occasional writes that aren't a multiple of the LCM of the + * number of children and the minimum stripe width are + * sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk + * format requirement that we need to support for all eternity, + * but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. 
+ */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -618,7 +1026,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -770,6 +1186,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", + rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1221,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", + rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1270,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", + rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -1289,6 +1714,8 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", + rr, ntgts); uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; @@ -1429,10 +1856,20 @@ 
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, + (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1553,19 +1990,71 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent + * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can + * ignore these because they can't be on RAIDZ (device removal doesn't + * support RAIDZ). + */ +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. 
Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). + */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1592,21 +2081,36 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + + rc->rc_shadow_error = zio->io_error; +} +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, 
&remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. + */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1617,7 +2121,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1625,7 +2129,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1637,31 +2141,57 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. 
- */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). + */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1693,7 +2223,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | 
ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1703,6 +2234,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. 
+ */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1726,24 +2307,90 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_PHYSICAL_BIRTH(zio->io_bp)); + zfs_dbgmsg("zio=%px bm=%llu/%llu/%llu/%llu phys_birth=%llu " + "logical_width=%llu", + zio, + (long long)zio->io_bookmark.zb_objset, + (long long)zio->io_bookmark.zb_object, + (long long)zio->io_bookmark.zb_level, + (long long)zio->io_bookmark.zb_blkid, + (long long)BP_PHYSICAL_BIRTH(zio->io_bp), + (long long)logical_width); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_cb()). Therefore we may + * see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. 
+ */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=%lld " + "next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. 
- */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1837,6 +2484,9 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { + zfs_dbgmsg("raidz_parity_verify found error on " + "col=%u devidx=%u", + c, (int)rc->rc_devidx); raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1852,8 +2502,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1892,9 +2544,20 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ +#if 1 + zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u " + "verifying=%s", + parity_errors, parity_untried, data_errors, + (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors) ? 
"yes" : "no"); +#endif +#if 1 if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { +#else + if ((zio->io_flags & ZIO_FLAG_RESILVER)) { +#endif int n = raidz_parity_verify(zio, rr); unexpected_errors += n; ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); @@ -1926,6 +2589,39 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. + */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair. By clearing the + * SCAN_THREAD flag, we prevent this from happening, + * despite having the REPAIR flag set. + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR, NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1944,6 +2640,51 @@ raidz_restore_orig_data(raidz_map_t *rm) } } +/* + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). 
Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + +#if 1 + zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u " + "rc_offset=%llx rc_devidx=%u sector_id=%llu", + physical_width, + original_width, + ashift, + i, + (long long)rc->rc_offset, + (int)rc->rc_devidx, + (long long)sector_id); +#endif + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed @@ -1953,6 +2694,13 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? 
+ rm->rm_original_width : physical_width; + + zfs_dbgmsg( + "raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u ntgts=%u", + zio, ltgts[0], ltgts[1], ltgts[2], ntgts); /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1962,6 +2710,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", + r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1974,7 +2725,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1987,13 +2741,33 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more + * failures than can be + * reconstructed, which is also more + * than the size of my_tgts. This + * check prevents accessing past the + * end of my_tgts. The "dead > + * nparity" check below will fail + * this reconstruction attempt. 
+ */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + zfs_dbgmsg("simulating failure " + "of col %u devidx %u", + c, (int)rc->rc_devidx); + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); raidz_restore_orig_data(rm); return (EINVAL); } @@ -2037,11 +2811,14 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + zfs_dbgmsg("reconstruction successful (checksum verified)"); return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum failed", + zio); return (ECKSUM); } @@ -2056,7 +2833,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2087,18 +2864,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. 
*/ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2116,8 +2897,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2148,6 +2937,10 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + zfs_dbgmsg("reconstruction failed " + "for num_failures=%u; tried all " + "combinations", + num_failures); break; } @@ -2159,7 +2952,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. 
*/ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2174,7 +2967,7 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2199,7 +2992,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2208,32 +3002,38 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. 
*/ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } } static void -vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, - raidz_row_t *rr) +vdev_raidz_io_done_reconstruct_known_missing(raidz_map_t *rm, raidz_row_t *rr) { int parity_errors = 0; int parity_untried = 0; @@ -2242,7 +3042,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2314,7 +3113,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2372,15 +3171,51 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. 
+ */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). + */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; - vdev_raidz_io_done_reconstruct_known_missing(zio, - rm, rr); + vdev_raidz_io_done_reconstruct_known_missing(rm, rr); } if (raidz_checksum_verify(zio) == 0) { @@ -2423,7 +3258,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. I think that + * the only case where this is less durable than + * a non-expanded RAIDZ, is if we have a silent + * failure during expansion. In that case, one block + * could be partially in the old format and partially + * in the new format, so we'd lost some sectors + * from the old format and some from the new format. + * + * e.g. 
logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. + * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2431,6 +3313,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2457,6 +3343,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. 
+ */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2499,7 +3393,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. + */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2525,15 +3436,1065 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. 
+ */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. + */ + zfs_dbgmsg("reflow syncing txg=%llu off_pertxg=%llu failed_off=%llu", + (long long)dmu_tx_get_txg(tx), + (long long)vre->vre_offset_pertxg[txgoff], + (long long)vre->vre_failed_offset); + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_NOT_IN_USE, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + 1; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). 
+ */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_cb(). + */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + zfs_dbgmsg("completed reflow offset=%llu size=%llu txg=%llu err=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. 
+ */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu \ +err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. + * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. 
+ * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + + zfs_dbgmsg("copying offset %llu, ubsync offset = %llu, " + "max_overwrite = %llu wait for txg %llu to sync", + (long long)offset, + (long long)ubsync_blkid << ashift, + (long long)next_overwrite_blkid << ashift, + (long long)dmu_tx_get_txg(tx)); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + zfs_dbgmsg("initiating reflow write offset=%llu length=%llu", + (long long)offset, (long long)length); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). 
+ */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing. + */ +static void +raidz_expand_pause(spa_t *spa, uint64_t progress) +{ + while (raidz_expand_max_offset_pause != 0 && + raidz_expand_max_offset_pause <= progress) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space much be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). 
+ */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(spa, 1); + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { +#if 0 + zio_nowait(zio_read_phys(pio, raidvd->vdev_child[i], + VDEV_LABEL_START_SIZE, read_size, abds[i], + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + 0, B_FALSE)); +#else + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, raidz_scratch_child_done, pio)); +#endif + } + zio_wait(pio); + + /* + * Reflow in memory. + */ + raidz_expand_pause(spa, 2); + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { +#if 0 + zio_nowait(zio_write_phys(pio, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET, write_size, abds[i], + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + 0, B_TRUE)); +#else + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access to boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } +#endif + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(spa, 3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(spa, 4); + + /* + * Overwrite with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { +#if 0 + zio_nowait(zio_write_phys(pio, raidvd->vdev_child[i], + VDEV_LABEL_START_SIZE, write_size, abds[i], + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + 0, B_FALSE)); +#else + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); +#endif + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(spa, 5); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_NOT_IN_USE, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(spa, 6); + + /* + * Update progress. 
+ */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(spa, 7); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the + * vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + raidz_expand_pause(spa, 8); + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { +#if 0 + zio_nowait(zio_read_phys(pio, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET, write_size, abds[i], + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + 0, B_TRUE)); +#else + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access to boot area. 
+ */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); +#endif + } + zio_wait(pio); + raidz_expand_pause(spa, 9); + + /* + * Overwrite real location with reflow'ed data. + */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { +#if 0 + zio_nowait(zio_write_phys(pio, raidvd->vdev_child[i], + VDEV_LABEL_START_SIZE, write_size, abds[i], + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + 0, B_FALSE)); +#else + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); +#endif + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + raidz_expand_pause(spa, 10); + + /* + * Update uberblock. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_NOT_IN_USE, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); + + raidz_expand_pause(spa, 11); +} + +/* ARGSUSED */ +static boolean_t +spa_raidz_expand_cb_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* ARGSUSED */ +static void +spa_raidz_expand_cb(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + ASSERT(vre->vre_offset == UINT64_MAX || + vre->vre_offset == RRSS_GET_OFFSET(&spa->spa_ubsync)); + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(spa->spa_raidz_expand_zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly 
created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with a expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(spa->spa_raidz_expand_zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). 
+ */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * This delay will pause the removal around the point + * specified by zfs_remove_max_bytes_pause. We do this + * solely from the test suite or during debugging. + */ + while (raidz_expand_max_offset_pause != 0 && + raidz_expand_max_offset_pause <= vre->vre_offset && + !zthr_iscancelled(spa->spa_raidz_expand_zthr)) + delay(hz); + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(spa->spa_raidz_expand_zthr) && + vre->vre_failed_offset == UINT64_MAX) { + /* + * We are not being canceled, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. 
+ */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_cb_check, spa_raidz_expand_cb, spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + if (vd->vdev_top->vdev_id == vre->vre_vdev_id) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. 
+ */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != 
ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). + */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. 
*/ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2541,6 +4502,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2567,10 +4529,54 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + zfs_dbgmsg("reflow_in_progress=%u", (int)reflow_in_progress); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = 
txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; + + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) + vdrz->vd_original_width--; + *tsd = vdrz; return (0); @@ -2579,7 +4585,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2607,6 +4626,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2646,3 +4688,10 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_offset_pause, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion at this offset"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index deea7fedd770..12a703bdbde0 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -168,7 +168,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -179,6 +180,7 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } @@ 
-202,7 +204,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -976,6 +979,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1132,12 +1136,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1454,7 +1459,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1664,6 +1670,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); bzero(&ta, sizeof (ta)); 
ta.trim_vdev = vd; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 980e25958f7f..4638072c9712 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -755,8 +755,11 @@ tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', tags = ['functional', 'redacted_send'] [tests/functional/raidz] -tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] +tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos', + 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', + 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos'] tags = ['functional', 'raidz'] +timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index fff43e469165..7d8836f82c4b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -34,6 +34,7 @@ DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync +EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export @@ -62,6 +63,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +RAIDZ_EXPAND_MAX_OFFSET_PAUSE vdev.expand_max_offset_pause raidz_expand_max_offset_pause 
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 6075e1f1abbd..509d4e2e9fb7 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -96,6 +96,7 @@ if is_linux || is_freebsd; then "feature@bookmark_v2" "feature@livelist" "feature@zstd_compress" + "feature@raidz_expansion" ) fi diff --git a/tests/zfs-tests/tests/functional/raidz/Makefile.am b/tests/zfs-tests/tests/functional/raidz/Makefile.am index d93eb73cf832..c11c93ad9236 100644 --- a/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -5,4 +5,10 @@ dist_pkgdata_SCRIPTS = \ raidz_001_neg.ksh \ raidz_002_pos.ksh \ raidz_003_pos.ksh \ - raidz_004_pos.ksh + raidz_004_pos.ksh \ + raidz_expand_001_pos.ksh \ + raidz_expand_002_pos.ksh \ + raidz_expand_003_neg.ksh \ + raidz_expand_003_pos.ksh \ + raidz_expand_004_pos.ksh \ + raidz_expand_005_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh index bf22632c7eff..020593695c5d 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh @@ -36,6 +36,6 @@ # runtime might be longer. # -log_must raidz_test -S -e -t 60 +log_must raidz_test -S -e -t 300 log_pass "raidz_test parameter sweep test with expanded map succeeded." 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh index 6cd2bf7c9f60..5a7087139650 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh @@ -36,6 +36,6 @@ # runtime might be longer. # -log_must raidz_test -S -e -r 0 -t 60 +log_must raidz_test -S -e -r 0 -t 300 log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh new file mode 100755 index 000000000000..3f064d47292c --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -0,0 +1,226 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new devive to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool +# - fill it with some directories/files +# - attach device to the raidz pool +# - verify that device attached and the raidz pool size increase +# - verify resilver by replacing parity devices +# - verify resilver by replacing data devices +# - verify scrub by zeroing parity devices +# - verify scrub by zeroing data devices +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) +max_offset=$(get_tunable RAIDZ_EXPAND_MAX_OFFSET_PAUSE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $max_offset +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 5 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + log_note "newcopied=$newcopied" + done + log_note "paused at $newcopied" +} + +function test_resilver # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=0; i<$nparity; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + while ! 
is_pool_resilvered $pool; do + sleep 1 + done + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool offline $pool $dir/dev-$i + done + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool labelclear -f $dir/dev-$i + done + + log_must zpool import -o cachefile=none -d $dir $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + log_must zpool replace -f $pool $dir/dev-$i + done + + while ! is_pool_resilvered $pool; do + sleep 1 + done + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool +} + +function test_scrub # +{ + typeset pool=$1 + typeset nparity=$2 + typeset dir=$3 + typeset combrec=$4 + + randbyte=$(( ((RANDOM<<15) + RANDOM) % (dev_size_mb * (devs-1) * 1024 * 1024) )) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $randbyte + log_must zpool attach $TESTPOOL ${raid}-0 $dir/dev-$devs + wait_expand_paused + + log_must zpool export $pool + + for (( i=0; i<$nparity; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub $pool + + while ! is_pool_scrubbed $pool; do + sleep 1 + done + + log_must zpool clear $pool + + log_must zpool export $pool + + for (( i=$nparity; i<$nparity*2; i=i+1 )); do + dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + done + + log_must zpool import -o cachefile=none -d $dir $pool + + log_must zpool scrub $pool + + while ! 
is_pool_scrubbed $pool; do + sleep 1 + done + + log_must check_pool_status $pool "errors" "No known data errors" + + log_must zpool clear $pool + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $max_offset + log_must zpool wait -t raidz_expand $TESTPOOL +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + + log_must zpool create -f -o cachefile=none $TESTPOOL $raid ${disks[@]} + log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + test_scrub $TESTPOOL $nparity $dir + #test_resilver $TESTPOOL $nparity $dir + + zpool destroy "$TESTPOOL" +done + +log_pass "raidz expansion test succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh new file mode 100755 index 000000000000..a605ac911e58 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -0,0 +1,118 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new devive to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - for each free test block device +# - attach to the pool +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + log_must zfs set primarycache=metadata $pool + + log_must zfs create $pool/fs + log_must fill_fs /$pool/fs 1 512 100 
1024 R + + log_must zfs create -o compress=on $pool/fs2 + log_must fill_fs /$pool/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 + log_must fill_fs /$pool/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $pool) + + for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must dd if=/dev/urandom of=/${pool}/FILE-$RANDOM bs=1M \ + count=64 + + log_must zpool attach -w $pool ${raid}-0 $disk + + # Wait some time for pool size increase + sleep 5 + + typeset disk_attached=$(get_disklist $pool | grep $disk) + if [[ -z $disk_attached ]]; then + log_fail "pool $pool attached disk not found" + fi + + typeset expand_size=$(get_pool_prop size $pool) + if [[ "$expand_size" -le "$pool_size" ]]; then + log_fail "pool $pool not expanded" + fi + + verify_pool $pool + + pool_size=$expand_size + done + + zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." \ No newline at end of file diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh new file mode 100755 index 000000000000..5e5ff6717083 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_neg.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should reject device attach if pool +# is in checkpointed state. If checkpoint creation requested on +# expanding pool, the request should be rejected. + +# +# STRATEGY: +# 1. Create block device files for the test raidz pool. +# 2. Create pool and checkpoint it. +# 3. Try to expand raidz, ensure that request rejected. +# 4. Recreate the pool. +# 5. Apply raidz expansion. +# 6. Ensure, that checkpoint cannot be created. + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +nparity=1 +raid=raidz$nparity +pool=$TESTPOOL +opts="-o cachefile=none" + +# case 1: checkpoint exist, try to expand +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zpool checkpoint $pool +log_mustnot zpool attach $pool ${raid}-0 ${disks[$devs]} +log_must zpool destroy $pool + +# case 2: expansion in progress, try to checkpoint +log_must zpool create -f $opts $pool $raid ${disks[1..$(($devs-1))]} +log_must zfs set primarycache=metadata $pool +log_must zfs create $pool/fs +log_must fill_fs 
/$pool/fs 1 512 100 1024 R +log_must zpool attach $pool ${raid}-0 ${disks[$devs]} +log_mustnot zpool checkpoint $pool +log_must zpool destroy $pool + +log_pass "raidz expansion test succeeded." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh new file mode 100755 index 000000000000..b06ccfeb14ab --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_003_pos.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib + +# +# DESCRIPTION: +# Check raidz expansion is able to work correctly under i/o load. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set raidz expand offset pause +# - start randwritecomp on one of the datasets files +# - attach new device to the pool +# - wait reflow offset become equal to raidz expand pause offset +# - kill randwritecomp +# - verify pool +# - set raidz expand offset to max value to complete raidz expansion + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 EMBEDDED_SLOG_MIN_MS $embedded_slog_min_ms +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + pool_size=$(get_pool_prop size $pool) + pause=$((((RANDOM << 15) + RANDOM) % pool_size)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_bkgrnd randwritecomp /$pool/fs/file + pid0=$! 
+ + log_bkgrnd randwritecomp /$pool/fs2/file + pid1=$! + + log_must zpool attach $pool ${raid}-0 $disk + wait_expand_paused + + kill_if_running $pid0 + kill_if_running $pid1 + + log_must zpool scrub -w $pool + + log_must check_pool_status $pool "errors" "No known data errors" + log_must check_pool_status $pool "scan" "with 0 errors" + log_must check_pool_status $pool "scan" "repaired 0B" + + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool wait -t raidz_expand $pool + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh new file mode 100755 index 000000000000..585a293ae90b --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_004_pos.ksh @@ -0,0 +1,119 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion. +# +# STRATEGY: +# 1. 
Create block device files for the test raidz pool +# 2. For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - attach new device to the pool +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $embedded_slog_min_ms +} + +log_onexit cleanup + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + log_must zpool attach $pool ${raid}-0 $disk + + sleep 10 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool offline $pool ${disks[$i]} + log_must dd if=/dev/zero of=${disks[$i]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + sleep 3 + + for (( i=1; i<=$nparity; i=i+1 )); do + log_must zpool replace $pool ${disks[$i]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool wait -t raidz_expand $pool + + log_must zpool clear $pool + log_must zpool 
scrub -w $pool + + # XXX step sometimes FAILED + log_must zpool status -v + # log_must check_pool_status $pool "scan" "repaired 0B" + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." + diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh new file mode 100755 index 000000000000..954ee8a82d79 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_005_pos.ksh @@ -0,0 +1,170 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Check device replacement during raidz expansion using expansion pausing. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - create couple of datasets with different recordsize and fill it +# - set raidz expand offset pause +# - attach new device to the pool +# - wait reflow offset become equal to raidz expand pause offset +# - offline and zero vdevs allowed by parity +# - wait some time and start offlined vdevs replacement +# - wait replacement completion and verify pool status +# - loop thru vdevs replacing and raidz expand pause offset increasing +# - verify pool +# - set raidz expand offset to max value to complete raidz expansion + +typeset -r devs=10 +typeset -r dev_size_mb=128 + +typeset -a disks + +embedded_slog_min_ms=$(get_tunable EMBEDDED_SLOG_MIN_MS) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $embedded_slog_min_ms +} + +function wait_expand_paused +{ + oldcopied='0' + newcopied='1' + while [[ $oldcopied != $newcopied ]]; do + oldcopied=$newcopied + sleep 1 + newcopied=$(zpool status $TESTPOOL | \ + grep 'copied out of' | \ + awk '{print $1}') + done +} + +log_onexit cleanup + +function test_replace # +{ + pool=${1} + devices=${2} + nparity=${3} + device_count=0 + + log_must echo "devices=$devices" + + for dev in ${devices}; do + device_count=$((device_count+1)) + done + + index=$((RANDOM%(device_count-nparity))) + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool offline $pool ${disks[$((index+j))]} + log_must dd if=/dev/zero of=${disks[$((index+j))]} \ + bs=1024k count=$dev_size_mb conv=notrunc + done + + for (( j=1; j<=$nparity; j=j+1 )); do + log_must zpool replace $pool ${disks[$((index+j))]} + done + + log_must zpool wait -t replace $pool + log_must check_pool_status $pool "scan" "with 0 errors" + + log_must zpool clear $pool + log_must zpool scrub -w $pool + + # XXX step sometimes FAILED + log_must zpool 
status -v + # log_must check_pool_status $pool "scan" "repaired 0B" +} + +log_must set_tunable32 EMBEDDED_SLOG_MIN_MS 99999 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + pool=$TESTPOOL + opts="-o cachefile=none" + devices="" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + devices="${disks[1..$(($nparity+1))]}" + + log_must zfs create -o recordsize=8k $pool/fs + log_must fill_fs /$pool/fs 1 128 100 1024 R + + log_must zfs create -o recordsize=128k $pool/fs2 + log_must fill_fs /$pool/fs2 1 128 100 1024 R + + for disk in ${disks[$(($nparity+2))..$devs]}; do + pool_size=$(get_pool_prop size $pool) + pause=$((((RANDOM << 15) + RANDOM) % pool_size / 2)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool attach $pool ${raid}-0 $disk + devices="$devices $disk" + + wait_expand_paused + + for (( i=0; i<2; i++ )); do + test_replace $pool "$devices" $nparity + + pause=$((pause + (((RANDOM << 15) + RANDOM) % \ + pool_size) / 4)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + wait_expand_paused + done + + pause=$((devs*dev_size_mb*1024*1024)) + log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $pause + + log_must zpool wait -t raidz_expand $pool + done + + log_must zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." + From c64244b369b90d7977dc3921a116eaaad76b672b Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 03:24:59 -0700 Subject: [PATCH 03/25] Increase the number of txgs added to last reflow complete sync txg Some blocks, which were synced in the same txg as raidz_reflow_complete_sync(), can have incorrect logical width. The increasing of txg value, which was added to expand txgs array, can help in this case. 
--- module/zfs/vdev_raidz.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 9631ac8a9ccc..1f171ffddc8b 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3497,7 +3497,7 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) VERIFY0(vre->vre_offset_pertxg[i]); reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); - re->re_txg = tx->tx_txg + 1; + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; re->re_logical_width = vdrz->vd_physical_width; mutex_enter(&vdrz->vd_expand_lock); avl_add(&vdrz->vd_expand_txgs, re); From c32cc85fd0c0bb6eb479086e9e747bcc3fbe5f8b Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 16 Nov 2021 12:35:04 -0800 Subject: [PATCH 04/25] panic in zthr_iscancelled --- module/zfs/arc.c | 2 +- module/zfs/vdev_raidz.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 79e2d4381830..707c4711b90c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4911,7 +4911,7 @@ arc_evict_cb(void *arg, zthr_t *zthr) * broadcast will wake any remaining arc evict waiters. 
*/ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 1f171ffddc8b..9e2c6da82809 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4122,7 +4122,7 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; i < raidvd->vdev_ms_count && - !zthr_iscancelled(spa->spa_raidz_expand_zthr) && + !zthr_iscancelled(zthr) && vre->vre_failed_offset == UINT64_MAX; i++) { metaslab_t *msp = raidvd->vdev_ms[i]; @@ -4175,7 +4175,7 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) */ range_tree_clear(rt, 0, vre->vre_offset); - while (!zthr_iscancelled(spa->spa_raidz_expand_zthr) && + while (!zthr_iscancelled(zthr) && !range_tree_is_empty(rt) && vre->vre_failed_offset == UINT64_MAX) { @@ -4197,7 +4197,7 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) */ while (raidz_expand_max_offset_pause != 0 && raidz_expand_max_offset_pause <= vre->vre_offset && - !zthr_iscancelled(spa->spa_raidz_expand_zthr)) + !zthr_iscancelled(zthr)) delay(hz); mutex_enter(&vre->vre_lock); @@ -4259,7 +4259,7 @@ spa_raidz_expand_cb(void *arg, zthr_t *zthr) */ txg_wait_synced(spa->spa_dsl_pool, 0); - if (!zthr_iscancelled(spa->spa_raidz_expand_zthr) && + if (!zthr_iscancelled(zthr) && vre->vre_failed_offset == UINT64_MAX) { /* * We are not being canceled, so the reflow must be From 8cb83b7e481b6e0201b71cdcad1bc97209e33b9e Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 03:36:34 -0700 Subject: [PATCH 05/25] ztest: Skip ztest_vdev_LUN_growth() if raidz expansion is in-progress --- cmd/ztest/ztest.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 646805ab57cb..0fedcd49b84d 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ 
-4227,6 +4227,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } + /* + * If we under raidz expansion, the test can failed because metaslabs + * count will not increase immediately after vdevs growing. It will + * happen only after raidz expansion completion. + */ + if (spa->spa_raidz_expand) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; From 50e8daef6890f0fcdd965ae829af097111cb613f Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 03:45:57 -0700 Subject: [PATCH 06/25] ztest: Make ztest_vdev_raidz_attach() error checking closer to ztest_vdev_attach_detach() --- cmd/ztest/ztest.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0fedcd49b84d..c89307d54d7e 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -3934,8 +3934,8 @@ void ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; - uint64_t csize, ashift = ztest_get_ashift(); - vdev_t *cvd, *pvd; + uint64_t newsize, ashift = ztest_get_ashift(); + vdev_t *newvd, *pvd; nvlist_t *root; char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); int error, expected_error = 0; @@ -3965,9 +3965,8 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) * Get size of a child of the raidz group, * make sure device is a bit bigger */ - cvd = pvd->vdev_child[0]; - csize = vdev_get_min_asize(cvd); - csize += csize / 10; + newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; + newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); if (spa->spa_raidz_expand) expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; @@ -3983,7 +3982,7 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) /* * Build the nvlist describing newpath. 
*/ - root = make_vdev_root(newpath, NULL, NULL, csize, ashift, NULL, + root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 0, 0, 1); /* @@ -4002,11 +4001,16 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) nvlist_free(root); + if (error == EOVERFLOW || + error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT) + expected_error = error; + if (error == 0) { ztest_shared->zs_raidzs_attached++; } else if (error != 0 && error != expected_error) { fatal(0, "raidz attach (%s %llu) returned %d, expected %d", - newpath, (long long)csize, error, expected_error); + newpath, newsize, error, expected_error); } else if (error == 0 && ztest_shared->zs_do_raidz_scratch_verify) { /* * Wait raidz expansion thread starting and kill it. From 16a74ed2e30f22c4f41b3cd25b652173ff759537 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 03:52:47 -0700 Subject: [PATCH 07/25] ztest: Restore scratch object testing --- cmd/ztest/ztest.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index c89307d54d7e..9b48dd59299d 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -3926,6 +3926,20 @@ ztest_vdev_raidz_attach_possible(spa_t *spa) return (B_FALSE); } +static void +ztest_scratch_thread(void *arg) +{ + ztest_shared_t *zs = arg; + for (int t = 100; t > 0; t -= 1) { + if (!zs->zs_do_raidz_scratch_verify) + thread_exit(); + + (void) poll(NULL, 0, 100); + } + + ztest_kill(ztest_shared); +} + /* * Verify that we can attach raidz device. 
*/ @@ -3935,6 +3949,7 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; uint64_t newsize, ashift = ztest_get_ashift(); + kthread_t *scratch_thread = NULL; vdev_t *newvd, *pvd; nvlist_t *root; char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -3985,17 +4000,12 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 0, 0, 1); - /* - * XXX this doesn't work right because spa_vdev_attach() won't - * return until it can write the first txg of the reflow, which - * will be paused. We need to kill off from another thread?? - */ -#if 0 if (ztest_random(2) == 0 && expected_error == 0) { raidz_expand_max_offset_pause = RAIDZ_REFLOW_OFFSET_PAUSE; ztest_shared->zs_do_raidz_scratch_verify = B_TRUE; + scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, + ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } -#endif error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); @@ -4011,12 +4021,19 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) } else if (error != 0 && error != expected_error) { fatal(0, "raidz attach (%s %llu) returned %d, expected %d", newpath, newsize, error, expected_error); - } else if (error == 0 && ztest_shared->zs_do_raidz_scratch_verify) { - /* - * Wait raidz expansion thread starting and kill it. - */ - sleep(10); - ztest_kill(ztest_shared); + } + + if (ztest_shared->zs_do_raidz_scratch_verify) { + if (error != 0) { + /* + * Do not verify scratch object in case of error + * returned by vdev attaching. 
+ */ + raidz_expand_max_offset_pause = 0; + ztest_shared->zs_do_raidz_scratch_verify = B_FALSE; + } + + VERIFY0(thread_join(scratch_thread)); } out: From 05a81366e2e52fb6659666779fcf904737f951f4 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 04:34:06 -0700 Subject: [PATCH 08/25] ztest: Add raidz expansion testing as CLI option --- cmd/ztest/ztest.c | 63 +++++++++++++++++++++++++---------------------- man/man1/ztest.1 | 4 +-- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 9b48dd59299d..494c9d9464dc 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -172,6 +172,7 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; + int zo_raid_do_expand; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; @@ -372,8 +373,7 @@ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); /* * XXX: remove zi_raidz_attach_compatible field, when - * raidz expansion will be completely integrated together with - * ztest_raidz_attach_test variable. + * raidz expansion will be completely integrated. 
*/ typedef struct ztest_info { @@ -564,7 +564,6 @@ static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; -static boolean_t ztest_raidz_attach_test = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; @@ -784,7 +783,7 @@ static ztest_option_t option_table[] = { DEFAULT_RAID_CHILDREN, NULL}, { 'R', "raid-parity", "INTEGER", "Raid parity", DEFAULT_RAID_PARITY, NULL}, - { 'K', "raid-kind", "raidz|draid|random", "Raid kind", + { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", NO_DEFAULT, "random"}, { 'D', "draid-data", "INTEGER", "Number of draid data drives", DEFAULT_DRAID_DATA, NULL}, @@ -1126,8 +1125,17 @@ process_options(int argc, char **argv) /* When raid choice is 'random' add a draid pool 50% of the time */ if (strcmp(raid_kind, "random") == 0) { - (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? - "draid" : "raidz", sizeof (raid_kind)); + switch (ztest_random(3)) { + case 0: + (void) strlcpy(raid_kind, "raidz", sizeof (raid_kind)); + break; + case 1: + (void) strlcpy(raid_kind, "eraidz", sizeof (raid_kind)); + break; + case 2: + (void) strlcpy(raid_kind, "draid", sizeof (raid_kind)); + break; + } if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); @@ -1166,6 +1174,16 @@ process_options(int argc, char **argv) (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); + } else if (strcmp(raid_kind, "eraidz") == 0) { + /* using eraidz (expandable raidz) */ + zo->zo_raid_do_expand = B_TRUE; + + /* No top-level mirrors with raidz expansion for now */ + zo->zo_mirrors = 0; + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); @@ -3911,21 +3929,6 @@ raidz_scratch_verify(void) kernel_fini(); } -static boolean_t -ztest_vdev_raidz_attach_possible(spa_t *spa) -{ - ztest_shared_t *zs = ztest_shared; - vdev_t 
*rvd = spa->spa_root_vdev; - vdev_t *vd = rvd->vdev_child[0]; - - if (rvd->vdev_children == 1 && - strcmp(vd->vdev_ops->vdev_op_type, "raidz") == 0 && - zs->zs_mirrors == 0) - return (B_TRUE); - - return (B_FALSE); -} - static void ztest_scratch_thread(void *arg) { @@ -3955,19 +3958,21 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); int error, expected_error = 0; - if (ztest_opts.zo_mmp_test) - return; - mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); - if (ztest_device_removal_active) { + if (!ztest_opts.zo_raid_do_expand) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } - if (!ztest_vdev_raidz_attach_possible(spa)) { + if (ztest_opts.zo_mmp_test) { + spa_config_exit(spa, SCL_ALL, FTAG); + goto out; + } + + if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } @@ -7296,7 +7301,7 @@ ztest_execute(int test, ztest_info_t *zi, uint64_t id) int i; for (i = 0; i < zi->zi_iters; i++) { - if (!ztest_raidz_attach_test) + if (!ztest_opts.zo_raid_do_expand) zi->zi_func(zd, id); else if (zi->zi_raidz_attach_compatible) zi->zi_func(zd, id); @@ -7550,7 +7555,7 @@ ztest_freeze(void) spa_t *spa; int numloops = 0; - if (ztest_raidz_attach_test) + if (ztest_opts.zo_raid_do_expand) return; if (ztest_opts.zo_verbose >= 3) @@ -8106,8 +8111,6 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; - ztest_raidz_attach_test = ztest_vdev_raidz_attach_possible(spa); - /* * BUGBUG raidz expansion do not run this for now * VERIFY0(vdev_raidz_impl_set("cycle")); diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index fd1374a2f106..8f735d35643c 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -122,11 +122,11 @@ Number of mirror copies. Number of raidz/draid disks. .It Fl R , -raid-parity Ns = (default: Sy 1 ) Raid parity (raidz & draid). 
-.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy draid Ns | Ns Sy random No (default: Sy random ) +.It Fl K , -raid-kind Ns = Ns Sy raidz Ns | Ns Sy eraidz Ns | Ns Sy draid Ns | Ns Sy random No (default: Sy random ) The kind of RAID config to use. With .Sy random -the kind alternates between raidz and draid. +the kind alternates between raidz, eraidz (expandable raidz) and draid. .It Fl D , -draid-data Ns = (default: Sy 4 ) Number of data disks in a dRAID redundancy group. .It Fl S , -draid-spares Ns = (default: Sy 1 ) From 2020f19e494eb9ca51b8445c80fecd1e0ca29e68 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 25 Oct 2021 23:16:21 -0700 Subject: [PATCH 09/25] ztest: Fix integer printing --- cmd/ztest/ztest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 494c9d9464dc..48d3b9c1b084 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4024,7 +4024,7 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) if (error == 0) { ztest_shared->zs_raidzs_attached++; } else if (error != 0 && error != expected_error) { - fatal(0, "raidz attach (%s %llu) returned %d, expected %d", + fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", newpath, newsize, error, expected_error); } From bee78bb015f2376a82b02b2674eb4ed28c4af82b Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 18 Nov 2021 10:09:16 -0800 Subject: [PATCH 10/25] fix a bug where we can fail to repair a few blocks while in the middle of reflow The "shadow block" repair write was not actually being executed due to bypassing in lower layers. 
--- module/zfs/vdev_raidz.c | 29 +++++++++++++++---- .../functional/raidz/raidz_expand_001_pos.ksh | 23 +++++---------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 9e2c6da82809..9dc74c7f328f 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2580,6 +2580,12 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, + rc->rc_devidx, + (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -2605,19 +2611,28 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) continue; - vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + zfs_dbgmsg("zio=%px overwriting c=%u shadow_devidx=%u " + "shadow_offset=%llx", + zio, c, + rc->rc_shadow_devidx, + (long long)rc->rc_shadow_offset); /* * Note: We don't want to update the repair stats * because that would incorrectly indicate that there - * was bad data to repair. By clearing the - * SCAN_THREAD flag, we prevent this from happening, - * despite having the REPAIR flag set. + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). 
*/ - zio_t *cio = zio_vdev_child_io(zio, NULL, cvd2, + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR, NULL, NULL); + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; zio_nowait(cio); } @@ -3171,6 +3186,8 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + zfs_dbgmsg("vdev_raidz_io_done(%px)", zio); + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index 3f064d47292c..0e205fc46bb2 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -45,7 +45,7 @@ # - destroy the raidz pool typeset -r devs=6 -typeset -r dev_size_mb=512 +typeset -r dev_size_mb=128 typeset -a disks @@ -54,6 +54,8 @@ max_offset=$(get_tunable RAIDZ_EXPAND_MAX_OFFSET_PAUSE) function cleanup { + log_pos zpool status $TESTPOOL + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" for i in {0..$devs}; do @@ -70,7 +72,7 @@ function wait_expand_paused newcopied='1' while [[ $oldcopied != $newcopied ]]; do oldcopied=$newcopied - sleep 5 + sleep 2 newcopied=$(zpool status $TESTPOOL | \ grep 'copied out of' | \ awk '{print $1}') @@ -141,7 +143,8 @@ function test_scrub # typeset dir=$3 typeset combrec=$4 - randbyte=$(( ((RANDOM<<15) + RANDOM) % (dev_size_mb * (devs-1) * 1024 * 1024) )) + randbyte=$(( ((RANDOM<<15) + RANDOM) % \ + (dev_size_mb * (devs-1) * 1024 * 1024) )) log_must set_tunable64 RAIDZ_EXPAND_MAX_OFFSET_PAUSE $randbyte log_must zpool attach $TESTPOOL ${raid}-0 $dir/dev-$devs wait_expand_paused @@ -155,14 +158,8 @@ function test_scrub # log_must zpool import -o cachefile=none -d $dir $pool - log_must zpool scrub $pool - - 
while ! is_pool_scrubbed $pool; do - sleep 1 - done - + log_must zpool scrub -w $pool log_must zpool clear $pool - log_must zpool export $pool for (( i=$nparity; i<$nparity*2; i=i+1 )); do @@ -172,11 +169,7 @@ function test_scrub # log_must zpool import -o cachefile=none -d $dir $pool - log_must zpool scrub $pool - - while ! is_pool_scrubbed $pool; do - sleep 1 - done + log_must zpool scrub -w $pool log_must check_pool_status $pool "errors" "No known data errors" From 682fa251b6b35349d5df9d5993f8f374d0718e8c Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 21 Feb 2022 20:37:20 -0800 Subject: [PATCH 11/25] manpage comments --- cmd/ztest/ztest.c | 8 ++++---- man/man4/zfs.4 | 7 ++++--- man/man7/zpool-features.7 | 2 +- man/man8/zpool-attach.8 | 41 +++++++++++++++++++++------------------ man/man8/zpool-wait.8 | 2 +- 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 48d3b9c1b084..b11c07478dde 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4254,10 +4254,10 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) } /* - * If we under raidz expansion, the test can failed because metaslabs - * count will not increase immediately after vdevs growing. It will - * happen only after raidz expansion completion. - */ + * If we under raidz expansion, the test can failed because metaslabs + * count will not increase immediately after vdevs growing. It will + * happen only after raidz expansion completion. + */ if (spa->spa_raidz_expand) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ca7ae67402fe..d7f52eab7f07 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -388,13 +388,14 @@ Note, that both this many milliseconds and TXGs must pass before unloading will occur. . .It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong -Max amount of concurrent i/o for RAIDZ expansion. +Max amount of memory to use for RAID-Z expansion I/O. 
+This limits how much I/O can be outstanding at once. . .It Sy raidz_expand_max_offset_pause Ns = Ns Sy 0 Pq ulong -For testing, pause RAIDZ expansion at this offset. +For testing, pause RAID-Z expansion at this offset. . .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong -For expanded RAIDZ, aggregate reads that have more rows than this. +For expanded RAID-Z, aggregate reads that have more rows than this. . .It Sy reference_history Ns = Ns Sy 3 Pq int Maximum reference holders being tracked when reference_tracking_enable is active. diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 826f93b7a1f4..be9f5b385b00 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -840,7 +840,7 @@ are destroyed. .feature org.openzfs raidz_expansion no none This feature enables the .Nm zpool Cm attach -subcommand to attach a new device to a RAIDZ group, expanding the total +subcommand to attach a new device to a RAID-Z group, expanding the total amount usable space in the pool. See .Xr zpool-attach 8 . diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index c9adb8dada8a..a378fd1c47c2 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -47,12 +47,13 @@ to the existing .Ar device . The behavior differs depending on if the existing .Ar device -is a RAIDZ device, or a mirror/plain device. +is a RAID-Z device, or a mirror/plain device. .Pp -If the existing device is a mirror or plain device (e.g. specified as "sda" or -"mirror-7"), the new device will be mirrored with the existing device, a -resilver will be initiated, and the new device will contribute to additional -redundancy once the resilver completes. +If the existing device is a mirror or plain device +.Pq e.g. specified as Qo Li sda Qc or Qq Li mirror-7 , +the new device will be mirrored with the existing device, a resilver will be +initiated, and the new device will contribute to additional redundancy once the +resilver completes. 
If .Ar device is not currently part of a mirrored configuration, @@ -70,12 +71,14 @@ In either case, .Ar new_device begins to resilver immediately and any running scrub is cancelled. .Pp -If the existing device is a RAIDZ device (e.g. specified as "raidz2-0"), the new -device will become part of that RAIDZ group. +If the existing device is a RAID-Z device +.Pq e.g. specified as Qq Ar raidz2-0 , +the new device will become part of that RAID-Z group. A "raidz expansion" will be initiated, and the new device will contribute -additional space to the RAIDZ group once the expansion completes. +additional space to the RAID-Z group once the expansion completes. The expansion entails reading all allocated space from existing disks in the -RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the +RAID-Z group, and rewriting it to the new disks in the RAID-Z group (including +the newly added .Ar device ) . Its progress can be monitored with @@ -83,20 +86,20 @@ Its progress can be monitored with .Pp Data redundancy is maintained during and after the expansion. If a disk fails while the expansion is in progress, the expansion pauses until -the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk +the health of the RAID-Z vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). Expansion does not change the number of failures that can be tolerated -without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). -A RAIDZ vdev can be expanded multiple times. +without data loss (e.g. a RAID-Z2 is still a RAID-Z2 even after expansion). +A RAID-Z vdev can be expanded multiple times. .Pp -After the expansion completes, old blocks remain with their old data-to-parity -ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but distributed among the -larger set of disks. +After the expansion completes, old blocks retain their old data-to-parity +ratio +.Pq e.g. 
5-wide RAID-Z2 has 3 data and 2 parity +but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide -RAIDZ2 which has been expanded once to 6-wide, has 4 data to 2 parity). -However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly -less space than is expected may be reported for newly-written blocks, according -to +RAID-Z2 which has been expanded once to 6-wide, has 4 data and 2 parity). +However, the vdev's "assumed parity ratio" does not change, so slightly less +space than is expected may be reported for newly-written blocks, according to .Nm zfs Cm list , .Nm df , .Nm ls Fl s , diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 40603b671873..e979db7783c4 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -77,7 +77,7 @@ Scrub to cease .It Sy trim Manual trim to cease .It Sy raidz_expand -Attaching to a RAIDZ vdev to complete +Attaching to a RAID-Z vdev to complete .El .Pp If an From 6f8d24faea5e7b9bee6854423b1925520097c952 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 24 Feb 2022 20:34:59 -0800 Subject: [PATCH 12/25] fix assertion failure in raidz_reflow_sync() --- include/sys/uberblock_impl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 28ebca3c2ba1..d8e2853a37b7 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -81,14 +81,14 @@ typedef enum raidz_reflow_scratch_state { } raidz_reflow_scratch_state_t; #define RRSS_GET_OFFSET(ub) \ - BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 32, SPA_MINBLOCKSHIFT, 0) + BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0) #define RRSS_SET_OFFSET(ub, x) \ - BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 32, SPA_MINBLOCKSHIFT, 0, x) + BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0, x) #define RRSS_GET_STATE(ub) \ - BF64_GET_SB((ub)->ub_raidz_reflow_info, 
32, 8, 0, 0) + BF64_GET_SB((ub)->ub_raidz_reflow_info, 55, 9, 0, 0) #define RRSS_SET_STATE(ub, x) \ - BF64_SET_SB((ub)->ub_raidz_reflow_info, 32, 8, 0, 0, x) + BF64_SET_SB((ub)->ub_raidz_reflow_info, 55, 9, 0, 0, x) #define RAIDZ_REFLOW_SET(ub, state, offset) do { \ (ub)->ub_raidz_reflow_info = 0; \ From 972d154edafa5632ff3aeddefa31d795c435ba86 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 24 Feb 2022 22:24:49 -0800 Subject: [PATCH 13/25] one more manpage tweak --- man/man8/zpool-attach.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index a378fd1c47c2..209a5cfca0b3 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -98,7 +98,7 @@ ratio but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAID-Z2 which has been expanded once to 6-wide, has 4 data and 2 parity). -However, the vdev's "assumed parity ratio" does not change, so slightly less +However, the vdev's assumed parity ratio does not change, so slightly less space than is expected may be reported for newly-written blocks, according to .Nm zfs Cm list , .Nm df , From 03751b33ee823907d92d452e7ba1b60d71534c8c Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 28 Feb 2022 01:08:01 -0800 Subject: [PATCH 14/25] Do not work with shadow location in case if scratch space requested --- module/zfs/vdev_raidz.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 9dc74c7f328f..a521fd5c039d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -683,7 +683,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, * rangelock, which is held exclusively while the * copy is in progress. 
*/ - if (rc->rc_size != 0 && + if (!use_scratch && rc->rc_size != 0 && row_phys_cols != physical_cols && b + c < reflow_offset_next >> ashift) { ASSERT3U(row_phys_cols, ==, physical_cols - 1); From a26ffc99289aec211208b350694cbac46261fece Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 28 Feb 2022 01:32:03 -0800 Subject: [PATCH 15/25] Do not switch to scratch offset in the middle of the row --- module/zfs/vdev_raidz.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a521fd5c039d..025dca4592bd 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -622,7 +622,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, * when the pool is imported readonly. */ if (use_scratch && - (b + c) << ashift < reflow_offset_synced) { + (b + cols) << ashift <= reflow_offset_synced) { rc->rc_offset -= VDEV_BOOT_SIZE; } From 49f131fd013a5e4bee267aa33482624a5125b962 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 28 Feb 2022 02:13:02 -0800 Subject: [PATCH 16/25] Fix raidz asize computation in case of expansion is in progress --- module/zfs/vdev_raidz.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 025dca4592bd..5ff7330937a0 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1970,8 +1970,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, cvd->vdev_physical_ashift); } - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; + + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -2146,6 +2153,9 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) if (rc->rc_size == 0) continue; + ASSERT(rc->rc_offset + 
rc->rc_size < + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3P(rc->rc_abd, !=, NULL); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, @@ -2154,6 +2164,10 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) if (rc->rc_shadow_devidx != INT_MAX) { vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT(rc->rc_shadow_offset + abd_get_size(rc->rc_abd) < + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, rc->rc_shadow_offset, rc->rc_abd, abd_get_size(rc->rc_abd), @@ -2182,6 +2196,9 @@ raidz_start_skip_writes(zio_t *zio) continue; ASSERT3P(rc->rc_abd, ==, NULL); + ASSERT(rc->rc_offset + rc->rc_size < + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, From d99d9a30760dbe75be74717be8ef29e1720ddc0d Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 28 Feb 2022 04:30:07 -0800 Subject: [PATCH 17/25] Make vdev_rz_expanding config variable syncing more earlier --- module/zfs/vdev_label.c | 3 +++ module/zfs/vdev_raidz.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d30e8cdcf177..cb11d07de2c2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -508,6 +508,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } + if (vd->vdev_rz_expanding) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 5ff7330937a0..6457411a550d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4661,10 +4661,6 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); 
- if (vdrz->vn_vre.vre_state == DSS_SCANNING) { - fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); - } - mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); From cba22533e4826a87483d9fe254d20ee9b6b78e85 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Tue, 19 Apr 2022 00:43:17 -0700 Subject: [PATCH 18/25] Skip mmp ub writing if scratch object is active. MMP uberblock could be overwritten by scratch object if raidz expansion is in progress. --- module/zfs/mmp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index f67a4eb22a2d..ae320e0ce9d7 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -437,6 +437,7 @@ static void mmp_write_uberblock(spa_t *spa) { int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; mmp_thread_t *mmp = &spa->spa_mmp; uberblock_t *ub; vdev_t *vd = NULL; @@ -451,6 +452,19 @@ mmp_write_uberblock(spa_t *spa) "gethrtime %llu", spa_name(spa), lock_acquire_time, gethrtime()); + /* + * Skip mmp uberblock writing if raidz expansion is in progress and + * scratch object is active. + */ + if (vre) { + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + if (vre->vre_offset < + vd->vdev_children * VDEV_LABEL_START_SIZE) { + spa_config_exit(spa, SCL_STATE, mmp_tag); + return; + } + } + mutex_enter(&mmp->mmp_io_lock); error = mmp_next_leaf(spa); From 5b531056ccb339ac7bd496597deaf8f5d5a2ae91 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 24 Oct 2022 03:40:41 -0700 Subject: [PATCH 19/25] Revert "Make vdev_rz_expanding config variable syncing more earlier" This reverts commit d99d9a30760dbe75be74717be8ef29e1720ddc0d. 
--- module/zfs/vdev_label.c | 3 --- module/zfs/vdev_raidz.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index cb11d07de2c2..d30e8cdcf177 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -508,9 +508,6 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } - if (vd->vdev_rz_expanding) { - fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); - } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6457411a550d..5ff7330937a0 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4661,6 +4661,10 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); From 4c565c79fd1a45a18b2a302180b72df45dbcc8f1 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 14 Nov 2022 08:06:07 -0800 Subject: [PATCH 20/25] Skip mmp ub writing if scratch object is active. Improve comment. --- module/zfs/mmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index ae320e0ce9d7..358be5cbc91b 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -454,7 +454,8 @@ mmp_write_uberblock(spa_t *spa) /* * Skip mmp uberblock writing if raidz expansion is in progress and - * scratch object is active. + * scratch object is active. Take a priority to uberblocks with actual + * scratch state information. 
*/ if (vre) { vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); From d4c6d3687bf602ca25d2dd65fdc88649c19d684f Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 14 Nov 2022 08:09:30 -0800 Subject: [PATCH 21/25] Handle scratch in case of shadow writes --- module/zfs/vdev_raidz.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 5ff7330937a0..ff3ee2d8e3d0 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -683,13 +683,16 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, * rangelock, which is held exclusively while the * copy is in progress. */ - if (!use_scratch && rc->rc_size != 0 && + if (rc->rc_size != 0 && row_phys_cols != physical_cols && b + c < reflow_offset_next >> ashift) { ASSERT3U(row_phys_cols, ==, physical_cols - 1); rc->rc_shadow_devidx = (b + c) % physical_cols; rc->rc_shadow_offset = ((b + c) / physical_cols) << ashift; + if (use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + zfs_dbgmsg("rm=%px row=%d b+c=%llu " "shadow_devidx=%u shadow_offset=%llu", rm, (int)row, (long long)(b + c), From 6d3738824b5adf382bfc506f5f39f41fb1dc1db6 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 14 Nov 2022 08:17:22 -0800 Subject: [PATCH 22/25] Scratch logic refactoring --- module/zfs/vdev_raidz.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index ff3ee2d8e3d0..cdf452452912 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -552,6 +552,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, #endif for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; raidz_row_t *rr = vdev_raidz_row_alloc(cols); rm->rm_row[row] = rr; @@ -574,6 +575,8 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, int row_phys_cols = physical_cols; if (b + cols > reflow_offset_synced >> ashift) row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; /* starting child 
of this row */ uint64_t child_id = b % row_phys_cols; @@ -613,18 +616,15 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, rc->rc_offset = child_offset; /* - * Get this from the scratch space if appropriate. We - * should only be doing reads if this is the case. + * Get this from the scratch space if appropriate. * This only happens if we crashed in the middle of * raidz_reflow_scratch_sync() (while it's running, * the rangelock prevents us from doing concurrent * io), and even then only during zpool import or * when the pool is imported readonly. */ - if (use_scratch && - (b + cols) << ashift <= reflow_offset_synced) { + if (row_use_scratch) rc->rc_offset -= VDEV_BOOT_SIZE; - } uint64_t dc = c - rr->rr_firstdatacol; if (c < rr->rr_firstdatacol) { From d2f0fedf5527a5a6896868808a4292cd7453cea9 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Wed, 16 Nov 2022 06:40:46 -0800 Subject: [PATCH 23/25] Remove skip mmp ub writing if scratch object is active. Add mmp uberblock actualization from scratch object side --- module/zfs/mmp.c | 14 -------------- module/zfs/vdev_raidz.c | 7 +++++++ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 358be5cbc91b..fae24a513820 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -452,20 +452,6 @@ mmp_write_uberblock(spa_t *spa) "gethrtime %llu", spa_name(spa), lock_acquire_time, gethrtime()); - /* - * Skip mmp uberblock writing if raidz expansion is in progress and - * scratch object is active. Take a priority to uberblocks with actual - * scratch state information. 
- */ - if (vre) { - vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); - if (vre->vre_offset < - vd->vdev_children * VDEV_LABEL_START_SIZE) { - spa_config_exit(spa, SCL_STATE, mmp_tag); - return; - } - } - mutex_enter(&mmp->mmp_io_lock); error = mmp_next_leaf(spa); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index cdf452452912..6bfd814de24f 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3936,6 +3936,9 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", @@ -3989,6 +3992,8 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", @@ -4103,6 +4108,8 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) spa->spa_ubsync.ub_timestamp++; VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow recovery: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", From 636573dcfa2c96b7622e4933e7b3d1be3dc1f0cc Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Wed, 16 Nov 2022 06:44:49 -0800 Subject: [PATCH 24/25] Fix 'shadow write' to scratch region --- module/zfs/vdev_raidz.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6bfd814de24f..1ed17a20afa5 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ 
-668,6 +668,9 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, rc->rc_size); } + if (rc->rc_size == 0) + continue; + /* * If any part of this row is in both old and new * locations, the primary location is the old @@ -683,14 +686,13 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, * rangelock, which is held exclusively while the * copy is in progress. */ - if (rc->rc_size != 0 && - row_phys_cols != physical_cols && - b + c < reflow_offset_next >> ashift) { - ASSERT3U(row_phys_cols, ==, physical_cols - 1); + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { rc->rc_shadow_devidx = (b + c) % physical_cols; rc->rc_shadow_offset = ((b + c) / physical_cols) << ashift; - if (use_scratch) + if (row_use_scratch) rc->rc_shadow_offset -= VDEV_BOOT_SIZE; zfs_dbgmsg("rm=%px row=%d b+c=%llu " From c3de3a60937f90c69f3df5c87d47c3c7a8776ec9 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Wed, 16 Nov 2022 06:50:58 -0800 Subject: [PATCH 25/25] Remove unneeded argument from mmp.c --- module/zfs/mmp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index fae24a513820..f67a4eb22a2d 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -437,7 +437,6 @@ static void mmp_write_uberblock(spa_t *spa) { int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - vdev_raidz_expand_t *vre = spa->spa_raidz_expand; mmp_thread_t *mmp = &spa->spa_mmp; uberblock_t *ub; vdev_t *vd = NULL;