Skip to content

Commit

Permalink
Detect IO errors during device removal
Browse files Browse the repository at this point in the history
* Detect IO errors during device removal

While device removal cannot verify the checksums of individual
blocks during device removal, it can reasonably detect hard IO
errors from the leaf vdevs.  Failure to perform this error
checking can result in device removal completing successfully,
but moving no data which will permanently corrupt the pool.

Situation 1: faulted/degraded vdevs

In the configuration shown below, the removal of mirror-0 will
permanently corrupt the pool.  Device removal will preferentially
copy data from 'vdev1 -> vdev3' and from 'vdev2 -> vdev4'.  Which
in this case will result in nothing being copied since one vdev
in each of those groups in unavailable.  However, device removal
will complete successfully since all IO errors are ignored.

  tank                DEGRADED     0     0     0
    mirror-0          DEGRADED     0     0     0
      /var/tmp/vdev1  FAULTED      0     0     0  external fault
      /var/tmp/vdev2  ONLINE       0     0     0
    mirror-1          DEGRADED     0     0     0
      /var/tmp/vdev3  ONLINE       0     0     0
      /var/tmp/vdev4  FAULTED      0     0     0  external fault

This issue is resolved by updating the source child selection
logic to exclude unreadable leaf vdevs.  Additionally, unwritable
destination child vdevs which can never succeed are skipped to
prevent generating a large number of write IO errors.

Situation 2: individual hard IO errors

During removal if an unexpected hard IO error is encountered when
either reading or writing the child vdev the entire removal
operation is cancelled.  While it may be possible to reconstruct
the data after removal that cannot be guaranteed.  The only
strictly safe thing to do is to cancel the removal.

As a future improvement we may want to instead suspend the removal
process and allow the damaged region to be retried.  But that work
is left for another time, hard IO errors during the removal process
are expected to be exceptionally rare.

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue openzfs#6900
Closes openzfs#8161
  • Loading branch information
behlendorf authored and Gregor Kopka committed Jan 7, 2019
1 parent 06514ca commit 0f70df0
Show file tree
Hide file tree
Showing 9 changed files with 350 additions and 19 deletions.
16 changes: 16 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -1982,6 +1982,22 @@ last resort, as it typically results in leaked space, or worse.
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE

.sp
.ne 2
.na
\fBzfs_removal_ignore_errors\fR (int)
.ad
.RS 12n
.sp
Ignore hard IO errors during device removal. When set, if a device encounters
a hard IO error during the removal process the removal will not be cancelled.
This can result in a normally recoverable block becoming permanently damaged
and is not recommended. This should only be used as a last resort when the
pool cannot be returned to a healthy state prior to removing the device.
.sp
Default value: \fB0\fR.
.RE

.sp
.ne 2
.na
Expand Down
7 changes: 4 additions & 3 deletions man/man8/zpool.8
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd April 27, 2018
.Dd November 29, 2018
.Dt ZPOOL 8 SMM
.Os Linux
.Sh NAME
Expand Down Expand Up @@ -1942,8 +1942,9 @@ In this case, the
command initiates the removal and returns, while the evacuation continues in
the background.
The removal progress can be monitored with
.Nm zpool Cm status.
The
.Nm zpool Cm status .
If an IO error is encountered during the removal process it will be
cancelled. The
.Sy device_removal
feature flag must be enabled to remove a top-level vdev, see
.Xr zpool-features 5 .
Expand Down
100 changes: 89 additions & 11 deletions module/zfs/vdev_removal.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
typedef struct vdev_copy_arg {
metaslab_t *vca_msp;
uint64_t vca_outstanding_bytes;
uint64_t vca_read_error_bytes;
uint64_t vca_write_error_bytes;
kcondvar_t vca_cv;
kmutex_t vca_lock;
} vdev_copy_arg_t;
Expand All @@ -99,6 +101,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
*/
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;

/*
* Ignore hard IO errors during device removal. When set if a device
* encounters hard IO error during the removal process the removal will
* not be cancelled. This can result in a normally recoverable block
* becoming permanently damaged and is not recommended.
*/
int zfs_removal_ignore_errors = 0;

/*
* Allow a remap segment to span free chunks of at most this size. The main
* impact of a larger span is that we will read and write larger, more
Expand Down Expand Up @@ -126,6 +136,7 @@ int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"

static void spa_vdev_remove_thread(void *arg);
static int spa_vdev_remove_cancel_impl(spa_t *spa);

static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
Expand Down Expand Up @@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)

mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;

if (zio->io_error != 0)
vca->vca_write_error_bytes += zio->io_size;

cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
}
Expand All @@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
vdev_copy_arg_t *vca = zio->io_private;

if (zio->io_error != 0) {
mutex_enter(&vca->vca_lock);
vca->vca_read_error_bytes += zio->io_size;
mutex_exit(&vca->vca_lock);
}

zio_nowait(zio_unique_parent(zio));
}

Expand Down Expand Up @@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
{
ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);

/*
* If the destination child in unwritable then there is no point
* in issuing the source reads which cannot be written.
*/
if (!vdev_writeable(dest_child_vd))
return;

mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes += size;
mutex_exit(&vca->vca_lock);

abd_t *abd = abd_alloc_for_io(size, B_FALSE);

vdev_t *source_child_vd;
vdev_t *source_child_vd = NULL;
if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
/*
* Source and dest are both mirrors. Copy from the same
* child id as we are copying to (wrapping around if there
* are more dest children than source children).
* are more dest children than source children). If the
* preferred source child is unreadable select another.
*/
source_child_vd =
source_vd->vdev_child[dest_id % source_vd->vdev_children];
for (int i = 0; i < source_vd->vdev_children; i++) {
source_child_vd = source_vd->vdev_child[
(dest_id + i) % source_vd->vdev_children];
if (vdev_readable(source_child_vd))
break;
}
} else {
source_child_vd = source_vd;
}

/*
* There should always be at least one readable source child or
* the pool would be in a suspended state. Somehow selecting an
* unreadable child would result in IO errors, the removal process
* being cancelled, and the pool reverting to its pre-removal state.
*/
ASSERT3P(source_child_vd, !=, NULL);

zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
dest_child_vd, dest_offset, abd, size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
Expand Down Expand Up @@ -1361,6 +1404,8 @@ spa_vdev_remove_thread(void *arg)
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
vca.vca_outstanding_bytes = 0;
vca.vca_read_error_bytes = 0;
vca.vca_write_error_bytes = 0;

mutex_enter(&svr->svr_lock);

Expand Down Expand Up @@ -1490,6 +1535,14 @@ spa_vdev_remove_thread(void *arg)
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
}

mutex_enter(&vca.vca_lock);
if (zfs_removal_ignore_errors == 0 &&
(vca.vca_read_error_bytes > 0 ||
vca.vca_write_error_bytes > 0)) {
svr->svr_thread_exit = B_TRUE;
}
mutex_exit(&vca.vca_lock);
}

mutex_exit(&svr->svr_lock);
Expand All @@ -1511,6 +1564,21 @@ spa_vdev_remove_thread(void *arg)
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);

/*
* During the removal process an unrecoverable read or write
* error was encountered. The removal process must be
* cancelled or this damage may become permanent.
*/
if (zfs_removal_ignore_errors == 0 &&
(vca.vca_read_error_bytes > 0 ||
vca.vca_write_error_bytes > 0)) {
zfs_dbgmsg("canceling removal due to IO errors: "
"[read_error_bytes=%llu] [write_error_bytes=%llu]",
vca.vca_read_error_bytes,
vca.vca_write_error_bytes);
spa_vdev_remove_cancel_impl(spa);
}
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
vdev_remove_complete(spa);
Expand Down Expand Up @@ -1689,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}

int
spa_vdev_remove_cancel(spa_t *spa)
static int
spa_vdev_remove_cancel_impl(spa_t *spa)
{
spa_vdev_remove_suspend(spa);

if (spa->spa_vdev_removal == NULL)
return (ENOTACTIVE);

uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;

int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
Expand All @@ -1713,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}

int
spa_vdev_remove_cancel(spa_t *spa)
{
spa_vdev_remove_suspend(spa);

if (spa->spa_vdev_removal == NULL)
return (ENOTACTIVE);

return (spa_vdev_remove_cancel_impl(spa));
}

/*
* Called every sync pass of every txg if there's a svr.
*/
Expand Down Expand Up @@ -2162,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
}

#if defined(_KERNEL)
module_param(zfs_removal_ignore_errors, int, 0644);
MODULE_PARM_DESC(zfs_removal_ignore_errors,
"Ignore hard IO errors when removing device");

module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");
Expand Down
6 changes: 3 additions & 3 deletions tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -728,9 +728,9 @@ tests = ['removal_all_vdev', 'removal_check_space',
'removal_remap', 'removal_remap_deadlists',
'removal_resume_export', 'removal_sanity', 'removal_with_add',
'removal_with_create_fs', 'removal_with_dedup',
'removal_with_export', 'removal_with_ganging',
'removal_with_remap', 'removal_with_remove',
'removal_with_scrub', 'removal_with_send',
'removal_with_errors', 'removal_with_export',
'removal_with_ganging', 'removal_with_faulted', 'removal_with_remap',
'removal_with_remove', 'removal_with_scrub', 'removal_with_send',
'removal_with_send_recv', 'removal_with_snapshot',
'removal_with_write', 'removal_with_zdb', 'remove_expanded',
'remove_mirror', 'remove_mirror_sanity', 'remove_raidz']
Expand Down
22 changes: 21 additions & 1 deletion tests/zfs-tests/include/libtest.shlib
Original file line number Diff line number Diff line change
Expand Up @@ -1931,6 +1931,23 @@ function verify_filesys # pool filesystem dir
log_must rm -rf $zdbout
}

#
# Given a pool issue a scrub and verify that no checksum errors are reported.
#
function verify_pool
{
typeset pool=${1:-$TESTPOOL}

log_must zpool scrub $pool
log_must wait_scrubbed $pool

cksum=$(zpool status $pool | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
if [[ $cksum != 0 ]]; then
log_must zpool status -v
log_fail "Unexpected CKSUM errors found on $pool ($cksum)"
fi
}

#
# Given a pool, and this function list all disks in the pool
#
Expand Down Expand Up @@ -3025,8 +3042,11 @@ function vdevs_in_pool

shift

# We could use 'zpool list' to only get the vdevs of the pool but we
# can't reference a mirror/raidz vdev using its ID (i.e mirror-0),
# therefore we use the 'zpool status' output.
typeset tmpfile=$(mktemp)
zpool list -Hv "$pool" >$tmpfile
zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile
for vdev in $@; do
grep -w ${vdev##*/} $tmpfile >/dev/null 2>&1
[[ $? -ne 0 ]] && return 1
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/removal/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ dist_pkgdata_SCRIPTS = \
removal_remap_deadlists.ksh removal_remap.ksh \
removal_reservation.ksh removal_resume_export.ksh \
removal_sanity.ksh removal_with_add.ksh removal_with_create_fs.ksh \
removal_with_dedup.ksh removal_with_export.ksh \
removal_with_dedup.ksh removal_with_errors.ksh \
removal_with_export.ksh removal_with_faulted.ksh \
removal_with_ganging.ksh removal_with_remap.ksh \
removal_with_remove.ksh removal_with_scrub.ksh \
removal_with_send.ksh removal_with_send_recv.ksh \
Expand Down
2 changes: 2 additions & 0 deletions tests/zfs-tests/tests/functional/removal/removal.kshlib
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ function test_removal_with_operation # callback [args]

kill $killpid
wait

verify_pool $TESTPOOL
}

#
Expand Down
Loading

0 comments on commit 0f70df0

Please sign in to comment.