Skip to content

Commit

Permalink
Defer new resilvers until the current one ends
Browse files Browse the repository at this point in the history
Currently, if a resilver is triggered for any reason while an
existing one is running, zfs will immediately restart the existing
resilver from the beginning to include the new drive. This causes
problems for system administrators when a drive fails while another
is already resilvering. In this case, the optimal thing to do to
reduce risk of data loss is to wait for the current resilver to end
before immediately replacing the second failed drive, which allows
the system to operate with two incomplete drives for the minimum
amount of time.

This patch introduces the resilver_defer feature that essentially
does this for the admin without forcing them to wait and monitor
the resilver manually. The change requires an on-disk feature
since we must mark drives that are part of a deferred resilver in
the vdev config to ensure that we do not assume they are done
resilvering when an existing resilver completes.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: @mmaybee 
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #7732
  • Loading branch information
Tom Caputi authored and behlendorf committed Oct 19, 2018
1 parent 9f438c5 commit 80a91e7
Show file tree
Hide file tree
Showing 28 changed files with 543 additions and 21 deletions.
58 changes: 52 additions & 6 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ static int zpool_do_replace(int, char **);
static int zpool_do_split(int, char **);

static int zpool_do_scrub(int, char **);
static int zpool_do_resilver(int, char **);

static int zpool_do_import(int, char **);
static int zpool_do_export(int, char **);
Expand Down Expand Up @@ -149,6 +150,7 @@ typedef enum {
HELP_REPLACE,
HELP_REMOVE,
HELP_SCRUB,
HELP_RESILVER,
HELP_STATUS,
HELP_UPGRADE,
HELP_EVENTS,
Expand Down Expand Up @@ -276,6 +278,7 @@ static zpool_command_t command_table[] = {
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ "resilver", zpool_do_resilver, HELP_RESILVER },
{ NULL },
{ "import", zpool_do_import, HELP_IMPORT },
{ "export", zpool_do_export, HELP_EXPORT },
Expand Down Expand Up @@ -358,6 +361,8 @@ get_usage(zpool_help_t idx)
return (gettext("\treopen [-n] <pool>\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n"));
case HELP_STATUS:
return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]"
"[-T d|u] [pool] ... \n"
Expand Down Expand Up @@ -1874,11 +1879,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
(void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);

if (ps != NULL && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
"resilvering" : "repairing");
if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) {
if (vs->vs_scan_processed != 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
"resilvering" : "repairing");
} else if (vs->vs_resilver_deferred) {
(void) printf(gettext(" (awaiting resilver)"));
}
}

if (cb->vcdl != NULL) {
Expand Down Expand Up @@ -6251,7 +6259,7 @@ scrub_callback(zpool_handle_t *zhp, void *data)
* Ignore faulted pools.
*/
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
(void) fprintf(stderr, gettext("cannot scan '%s': pool is "
"currently unavailable\n"), zpool_get_name(zhp));
return (1);
}
Expand Down Expand Up @@ -6319,6 +6327,44 @@ zpool_do_scrub(int argc, char **argv)
return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}

/*
* zpool resilver <pool> ...
*
* Restarts any in-progress resilver
*/
int
zpool_do_resilver(int argc, char **argv)
{
int c;
scrub_cbdata_t cb;

cb.cb_type = POOL_SCAN_RESILVER;
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
cb.cb_argc = argc;
cb.cb_argv = argv;

/* check options */
while ((c = getopt(argc, argv, "")) != -1) {
switch (c) {
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
}
}

argc -= optind;
argv += optind;

if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
}

return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}


/*
* Print out detailed scrub status.
*/
Expand Down
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_root/zpool_online/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_remove/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_replace/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_set/Makefile
Expand Down
2 changes: 2 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
#define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer"
#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
Expand Down Expand Up @@ -988,6 +989,7 @@ typedef struct vdev_stat {
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
} vdev_stat_t;

/*
Expand Down
7 changes: 7 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,13 @@ struct spa {
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
uint64_t spa_scan_pass_issued; /* issued bytes per pass */

/*
* We are in the middle of a resilver, and another resilver
* is needed once this one completes. This is set iff any
* vdev_resilver_deferred is set.
*/
boolean_t spa_resilver_deferred;
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);

extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd);

typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
VDEV_CONFIG_L2CACHE = 1 << 1,
Expand Down
1 change: 1 addition & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ struct vdev {
boolean_t vdev_isspare; /* was a hot spare */
boolean_t vdev_isl2cache; /* was a l2cache device */
boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */
boolean_t vdev_resilver_deferred; /* resilver deferred */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ typedef enum spa_feature {
SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURE_SPACEMAP_V2,
SPA_FEATURE_ALLOCATION_CLASSES,
SPA_FEATURE_RESILVER_DEFER,
SPA_FEATURES
} spa_feature_t;

Expand Down
1 change: 1 addition & 0 deletions include/zfs_gitrev.h
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#define ZFS_META_GITREV "unknown"
21 changes: 21 additions & 0 deletions man/man5/zpool-features.5
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,27 @@ can also be triggered on filesystems via `zfs set version=current <pool/fs>`.
The upgrade process runs in the background and may take a while to complete
for the filesystems containing a large number of files.

.RE
.sp
.ne 2
.na
\fB\fBresilver_defer\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.datto:resilver_defer
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE

This feature allows zfs to postpone new resilvers if an existing one is already
in progress. Without this feature, any new resilvers will cause the currently
running one to be immediately restarted from the beginning.

This feature becomes \fBactive\fR once a resilver has been defered, and returns
to being \fBenabled\fR when the defered resilver begins.

.RE

.sp
Expand Down
11 changes: 11 additions & 0 deletions man/man8/zpool.8
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool Ar device Op Ar new_device
.Nm
.Cm resilver
.Ar pool Ns ...
.Nm
.Cm scrub
.Op Fl s | Fl p
.Ar pool Ns ...
Expand Down Expand Up @@ -2069,6 +2072,14 @@ again.
.El
.It Xo
.Nm
.Cm resilver
.Ar pool Ns ...
.Xc
Starts a resilver. If an existing resilver is already running it will be
restarted from the beginning. Any drives that were scheduled for a deferred
resilver will be added to the new one.
.It Xo
.Nm
.Cm set
.Ar property Ns = Ns Ar value
.Ar pool
Expand Down
5 changes: 5 additions & 0 deletions module/zcommon/zfeature_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,11 @@ zpool_feature_init(void)
"Support for separate allocation classes.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
}

zfeature_register(SPA_FEATURE_RESILVER_DEFER,
"com.datto:resilver_defer", "resilver_defer",
"Support for defering new resilvers when one is already running.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
}

#if defined(_KERNEL)
Expand Down
103 changes: 101 additions & 2 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
unsigned long zfs_async_block_max_blocks = 100000;

int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */

/*
* We wait a few txgs after importing a pool to begin scanning so that
* the import / mounting code isn't held up by scrub / resilver IO.
Expand Down Expand Up @@ -720,6 +722,11 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);

if (func == POOL_SCAN_RESILVER) {
dsl_resilver_restart(spa->spa_dsl_pool, 0);
return (0);
}

if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
Expand All @@ -736,6 +743,41 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
* Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
* B_TRUE if we have devices that need to be resilvered and are available to
* accept resilver I/Os.
*/
static boolean_t
dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
{
boolean_t resilver_needed = B_FALSE;
spa_t *spa = vd->vdev_spa;

for (int c = 0; c < vd->vdev_children; c++) {
resilver_needed |=
dsl_scan_clear_deferred(vd->vdev_child[c], tx);
}

if (vd == spa->spa_root_vdev &&
spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
vdev_config_dirty(vd);
spa->spa_resilver_deferred = B_FALSE;
return (resilver_needed);
}

if (!vdev_is_concrete(vd) || vd->vdev_aux ||
!vd->vdev_ops->vdev_op_leaf)
return (resilver_needed);

if (vd->vdev_resilver_deferred)
vd->vdev_resilver_deferred = B_FALSE;

return (!vdev_is_dead(vd) && !vd->vdev_offline &&
vdev_resilver_needed(vd, NULL, NULL));
}

/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
Expand Down Expand Up @@ -835,6 +877,25 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);

/*
* Clear any deferred_resilver flags in the config.
* If there are drives that need resilvering, kick
* off an asynchronous request to start resilver.
* dsl_scan_clear_deferred() may update the config
* before the resilver can restart. In the event of
* a crash during this period, the spa loading code
* will find the drives that need to be resilvered
* when the machine reboots and start the resilver then.
*/
boolean_t resilver_needed =
dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
if (resilver_needed) {
spa_history_log_internal(spa,
"starting deferred resilver", tx,
"errors=%llu", spa_get_errlog_size(spa));
spa_async_request(spa, SPA_ASYNC_RESILVER);
}
}

scn->scn_phys.scn_end_time = gethrestime_sec();
Expand Down Expand Up @@ -2966,6 +3027,26 @@ dsl_scan_active(dsl_scan_t *scn)
return (used != 0);
}

static boolean_t
dsl_scan_check_deferred(vdev_t *vd)
{
boolean_t need_resilver = B_FALSE;

for (int c = 0; c < vd->vdev_children; c++) {
need_resilver |=
dsl_scan_check_deferred(vd->vdev_child[c]);
}

if (!vdev_is_concrete(vd) || vd->vdev_aux ||
!vd->vdev_ops->vdev_op_leaf)
return (need_resilver);

if (!vd->vdev_resilver_deferred)
need_resilver = B_TRUE;

return (need_resilver);
}

static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
Expand Down Expand Up @@ -3013,6 +3094,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);

/*
* Check that this top-level vdev has a device under it which
* is resilvering and is not deferred.
*/
if (!dsl_scan_check_deferred(vd))
return (B_FALSE);

return (B_TRUE);
}

Expand Down Expand Up @@ -3173,12 +3261,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa_t *spa = dp->dp_spa;
state_sync_type_t sync_type = SYNC_OPTIONAL;

if (spa->spa_resilver_deferred &&
!spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);

/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
* imported (see dsl_scan_init). We also restart scans if there
* is a deferred resilver and the user has manually disabled
* deferred resilvers via the tunable.
*/
if (dsl_scan_restarting(scn, tx)) {
if (dsl_scan_restarting(scn, tx) ||
(spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
Expand Down Expand Up @@ -4000,4 +4095,8 @@ MODULE_PARM_DESC(zfs_scan_strict_mem_lim,
module_param(zfs_scan_fill_weight, int, 0644);
MODULE_PARM_DESC(zfs_scan_fill_weight,
"Tunable to adjust bias towards more filled segments during scans");

module_param(zfs_resilver_disable_defer, int, 0644);
MODULE_PARM_DESC(zfs_resilver_disable_defer,
"Process all resilvers immediately");
#endif
Loading

0 comments on commit 80a91e7

Please sign in to comment.