From a6de073b6242e7b163aad7d58e605e83f30f9d49 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 13 Feb 2018 11:37:56 -0800 Subject: [PATCH] DLPX-57225 Reads from indirect mirrors only fin... ...d one copy Reviewed at: http://reviews.delphix.com/r/38104/ Ported-by: Tim Chase Signed-off-by: Tim Chase Requires-builders: none Problem When we read from an indirect vdev that points to a mirror vdev, we only consider one copy of the data. This can lead to reduced effective redundancy, because we might read a bad copy of the data from one side of the mirror, and not retry the other, good side of the mirror. Note that the problem is not with the removal process, but rather after the removal has completed (having copied correct data to both sides of the mirror), if one side of the mirror is silently damaged, we encounter the problem when reading the relocated data via the indirect vdev. Also note that the problem doesn't occur when ZFS knows that one side of the mirror is bad, e.g. when a disk entirely fails or is offlined. The problem manifests itself in several ways: Reads (from indirect vdevs that point to mirrors) may return a checksum error even though the good data exists on one side of the mirror scrub doesn't repair all data on the mirror (if some of it is pointed to via an indirect vdev). Solution: as an expedient fix, we will do the following: When reading a non-split block, the indirect vdev will pass the whole BP down to the mirror code. This lets the mirror code deal with verifying the checksum and trying other copies, etc. So for non-split blocks (typically >90% of all relocated blocks), the problem is completely solved. When reading split blocks, read the left side of all mirrors, and if the checksum of that fails, read the right side of all mirrors (and the 3rd side for 3-way mirrors, etc). This ensures that if there is a good copy of the data, we will return it. However, it doesn't issue repair i/os, and it doesn't fix scrub for indirect split blocks. A full fix will be the subject of a later PR. Porting notes: Commit 9f50093, "FreeBSD r256956: Improve ZFS N-way mirror read performance by using load and locality information" caused vdev_mirror.c. do diverge from the upstream code. Among other things, parts of vdev_mirror_map_alloc() were moved into the new vdev_mirror_map_init() function. The upstream version of this patch modified vdev_mirror_map_init() to select the child vdev based on the new zio hint. That code has been slightly refactored and put into vdev_mirror_map_init(). Upstream commit 20ef1b6, "8473 scrub does not detect errors on active spares" has not been ported to ZoL. It changed the name of the "mm_replacing" member of "struct mirror_map" to "mm_resilvering". The name has been left as "mm_replacing". --- include/sys/zio.h | 1 + module/zfs/vdev_indirect.c | 122 ++++++++++++++++++++++++++++++++++--- module/zfs/vdev_mirror.c | 24 +++++--- module/zfs/zio.c | 1 + 4 files changed, 132 insertions(+), 16 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 7eff42f86d9a..d2e261242f31 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -467,6 +467,7 @@ struct zio { vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; + int io_mirror_child_hint; uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 7e40e9a60b89..7af26d83d8ca 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -201,6 +202,28 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; */ int zfs_condense_indirect_commit_entry_delay_ticks = 0; +/* + * data specific to an indirect zio + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + int iv_mirror_child_hint; + int iv_mirror_max_children; +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + kmem_free(iv, sizeof (iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; + /* * Mark the given offset and size as being obsolete in the given txg. */ @@ -810,12 +833,6 @@ vdev_indirect_close(vdev_t *vd) { } -/* ARGSUSED */ -static void -vdev_indirect_io_done(zio_t *zio) -{ -} - /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, @@ -992,22 +1009,52 @@ vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; + blkptr_t *bp = NULL; + indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, + if (split_offset == 0 && size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + */ + bp = zio->io_bp; + iv->iv_split_block = B_FALSE; + } else { + iv->iv_split_block = B_TRUE; + } + + if (vd->vdev_ops == &vdev_mirror_ops) { + iv->iv_mirror_max_children = MAX(iv->iv_mirror_max_children, + vd->vdev_children); + } + + zio_t *cio = zio_vdev_child_io(zio, bp, vd, offset, abd_get_offset(zio->io_abd, split_offset), size, zio->io_type, zio->io_priority, - 0, vdev_indirect_child_io_done, zio)); + 0, vdev_indirect_child_io_done, zio); + + if (iv->iv_split_block) + cio->io_mirror_child_hint = iv->iv_mirror_child_hint; + + zio_nowait(cio); } static void vdev_indirect_io_start(zio_t *zio) { ASSERTV(spa_t *spa = zio->io_spa); + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { @@ -1022,6 +1069,65 @@ vdev_indirect_io_start(zio_t *zio) zio_execute(zio); } +/* ARGSUSED */ +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + if (iv->iv_mirror_max_children == 0) { + /* + * We didn't point to any mirrors, in which case there's + * nothing else to try. The checksum will be verified + * by this zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) + goto out; + + if (iv->iv_mirror_child_hint + 1 < iv->iv_mirror_max_children) { + /* + * The checksum doesn't match, but there are more mirrored + * copies of this data. Try a different version. + */ + iv->iv_mirror_child_hint++; + zio_vdev_io_redone(zio); + vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, + vdev_indirect_io_start_cb, zio); + return; + } else { + zio->io_error = ret; + } + +out: + /* + * Note, indirect vdevs don't handle a few important cases for split + * blocks when one side of a mirror is silently corrupt: + * + * When we read bad data and then retry the other side of the mirror, + * we should issue a repair write to fix the bad copy. + * + * When scrubbing, we should read all copies of the data (and + * repair any bad copies). + * + * We should generate ereports for any data we read with the wrong + * checksums. + */ + + zio_checksum_verified(zio); +} + vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index b56f955eb01b..8c4ed0646974 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -252,8 +252,10 @@ vdev_mirror_map_init(zio_t *zio) dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; - mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, - B_TRUE); + c = BP_GET_NDVAS(zio->io_bp); + if (zio->io_mirror_child_hint != -1) + c = zio->io_mirror_child_hint % c; + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -261,7 +263,10 @@ vdev_mirror_map_init(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - mm = vdev_mirror_map_alloc(vd->vdev_children, + c = vd->vdev_children; + if (zio->io_mirror_child_hint != -1) + c = zio->io_mirror_child_hint % c; + mm = vdev_mirror_map_alloc(c, (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops), B_FALSE); for (c = 0; c < mm->mm_children; c++) { @@ -486,12 +491,15 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b3beeda35b0e..d39cd3e60595 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -738,6 +738,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); + zio->io_mirror_child_hint = -1; mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);