Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Special failsafe feature #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1526,6 +1526,23 @@ zpool_do_add(int argc, char **argv)
}
}

/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}

/* pass off to make_root_vdev for processing */
nvroot = make_root_vdev(zhp, props, !check_inuse,
check_replication, B_FALSE, dryrun, argc, argv);
Expand Down Expand Up @@ -7604,6 +7621,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
}
}

/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}

nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
argc, argv);
if (nvroot == NULL) {
Expand Down
97 changes: 85 additions & 12 deletions cmd/zpool/zpool_vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
*/
boolean_t error_seen;
boolean_t is_force;
boolean_t is_alloc_class;

void
vdev_error(const char *fmt, ...)
Expand All @@ -94,8 +95,15 @@ vdev_error(const char *fmt, ...)
if (!error_seen) {
(void) fprintf(stderr, gettext("invalid vdev specification\n"));
if (!is_force)
(void) fprintf(stderr, gettext("use '-f' to override "
"the following errors:\n"));
if (is_alloc_class) {
(void) fprintf(stderr, gettext("Turn on the "
"special_failsafe pool property or use '-f'"
" to override the following errors:\n"));
is_alloc_class = B_FALSE;
} else {
(void) fprintf(stderr, gettext("use '-f' to "
"override the following errors:\n"));
}
else
(void) fprintf(stderr, gettext("the following errors "
"must be manually repaired:\n"));
Expand Down Expand Up @@ -442,6 +450,7 @@ typedef struct replication_level {
const char *zprl_type;
uint64_t zprl_children;
uint64_t zprl_parity;
boolean_t zprl_is_alloc_class;
} replication_level_t;

#define ZPOOL_FUZZ (16 * 1024 * 1024)
Expand Down Expand Up @@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b)
return (B_FALSE);
}

/*
 * Report whether 'props' asks for special_failsafe.
 *
 * Returns B_TRUE only when the special_failsafe pool property is present in
 * 'props' with the value "on" AND feature@special_failsafe, if it is present
 * in 'props', is not set to "disabled".  Returns B_FALSE in every other case
 * (property missing, property not "on", or feature explicitly disabled).
 */
static boolean_t
is_special_failsafe_enabled_in_props(nvlist_t *props)
{
	const char *feature_val = NULL;
	const char *prop_val = NULL;

	/* An explicitly disabled feature flag overrides the property value. */
	if (nvlist_lookup_string(props, "feature@special_failsafe",
	    &feature_val) == 0 && feature_val != NULL &&
	    strcmp(feature_val, "disabled") == 0)
		return (B_FALSE);

	/* No special_failsafe property in the list means "not enabled". */
	if (nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), &prop_val) != 0)
		return (B_FALSE);

	if (prop_val == NULL || strcmp(prop_val, "on") != 0)
		return (B_FALSE);

	return (B_TRUE);
}

/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev.
*/
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal)
{
nvlist_t **top;
uint_t t, toplevels;
Expand All @@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
nvlist_t *nv;
const char *type;
replication_level_t lastrep = {0};
replication_level_t rep;
replication_level_t rep = {0};
replication_level_t *ret;
replication_level_t *raidz, *mirror;
boolean_t dontreport;
Expand All @@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)

for (t = 0; t < toplevels; t++) {
uint64_t is_log = B_FALSE;
const char *str = NULL;

nv = top[t];

Expand All @@ -528,12 +568,32 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
strcmp(type, VDEV_TYPE_INDIRECT) == 0)
continue;

rep.zprl_type = type;

/*
* If special_failsafe=on then we know the special allocation
* class devices have at least one copy of their data on the
* pool so we can ignore their replication level.
*/
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&str);
if (str &&
((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) ||
(strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) {
rep.zprl_is_alloc_class = B_TRUE;
is_alloc_class = B_TRUE;
if (is_special_failsafe_enabled_in_props(props)) {
continue; /* We're backed up, skip redundancy */
}
} else {
is_alloc_class = B_FALSE;
}

if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) {
/*
* This is a 'file' or 'disk' vdev.
*/
rep.zprl_type = type;
rep.zprl_children = 1;
rep.zprl_parity = 0;
} else {
Expand All @@ -548,7 +608,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* We also check that the size of each vdev (if it can
* be determined) is the same.
*/
rep.zprl_type = type;
rep.zprl_children = 0;

if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
Expand Down Expand Up @@ -808,7 +867,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* report any difference between the two.
*/
static int
check_replication(nvlist_t *config, nvlist_t *newroot)
check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot)
{
nvlist_t **child;
uint_t children;
Expand All @@ -825,7 +884,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)

verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if ((current = get_replication(nvroot, B_FALSE)) == NULL)
if ((current = get_replication(props, nvroot, B_FALSE)) == NULL)
return (0);
}
/*
Expand All @@ -850,17 +909,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
* Get the replication level of the new vdev spec, reporting any
* inconsistencies found.
*/
if ((new = get_replication(newroot, B_TRUE)) == NULL) {
if ((new = get_replication(props, newroot, B_TRUE)) == NULL) {
free(current);
return (-1);
}

/*
* Check to see if the new vdev spec matches the replication level of
* the current pool.
*/
ret = 0;
if (current != NULL) {
if (current->zprl_is_alloc_class || new->zprl_is_alloc_class)
is_alloc_class = B_TRUE;
else
is_alloc_class = B_FALSE;

/*
* Special case:
* If there were any redundancy problems with alloc class vdevs
* BUT the pool had special_failsafe on, then we're fine since
* all the alloc class data has a copy in the main pool.
*/
if (is_special_failsafe_enabled_in_props(props) &&
is_alloc_class)
goto out;

if (is_raidz_mirror(current, new, &raidz, &mirror) ||
is_raidz_mirror(new, current, &raidz, &mirror)) {
if (raidz->zprl_parity != mirror->zprl_children - 1) {
Expand Down Expand Up @@ -899,7 +972,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
ret = -1;
}
}

out:
free(new);
if (current != NULL)
free(current);
Expand Down Expand Up @@ -1888,7 +1961,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
* found. We include the existing pool spec, if any, as we need to
* catch changes against the existing replication level.
*/
if (check_rep && check_replication(poolconfig, newroot) != 0) {
if (check_rep && check_replication(props, poolconfig, newroot) != 0) {
nvlist_free(newroot);
return (NULL);
}
Expand Down
2 changes: 2 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ typedef enum {
ZPOOL_PROP_DEDUP_TABLE_QUOTA,
ZPOOL_PROP_DEDUPCACHED,
ZPOOL_PROP_LAST_SCRUBBED_TXG,
ZPOOL_PROP_SPECIAL_FAILSAFE,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down Expand Up @@ -1635,6 +1636,7 @@ typedef enum {
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_STREAM_LARGE_MICROZAP,
ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE,
} zfs_errno_t;

/*
Expand Down
3 changes: 2 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing,
uint64_t missing_special);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
Expand Down
10 changes: 10 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,13 @@ struct spa {
uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */

/*
* Number of the 'spa_missing_tvds' that are alloc class devices in a
* pool with special_failsafe on, and are thus recoverable from
* errors.
*/
uint64_t spa_missing_recovered_tvds;

uint64_t spa_nonallocating_dspace;
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;
Expand Down Expand Up @@ -484,6 +491,9 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */

/* Backup special/dedup devices data to the pool */
boolean_t spa_special_failsafe;
};

extern char *spa_config_path;
Expand Down
6 changes: 6 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,12 @@ int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
#endif
int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);

extern boolean_t vdev_is_leaf(vdev_t *vd);
extern boolean_t vdev_is_special(vdev_t *vd);
extern boolean_t vdev_is_dedup(vdev_t *vd);
extern boolean_t vdev_is_alloc_class(vdev_t *vd);
extern boolean_t vdev_is_special_failsafe(vdev_t *vd);

/*
* Vdev ashift optimization tunables
*/
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ typedef enum spa_feature {
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURE_LONGNAME,
SPA_FEATURE_LARGE_MICROZAP,
SPA_FEATURE_SPECIAL_FAILSAFE,
SPA_FEATURES
} spa_feature_t;

Expand Down
9 changes: 9 additions & 0 deletions lib/libzfs/libzfs_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,15 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break;
case ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Cannot set pool prop special_failsafe=on since "
"feature@special_failsafe is not set to 'enabled'.\n"
"This could be because the special_failsafe pool prop was "
"manually turned off while the special_failsafe feature "
"flag was active, or the feature flag was disabled."));
zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
break;
default:
zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
Expand Down
10 changes: 5 additions & 5 deletions lib/libzutil/zutil_import.c
Original file line number Diff line number Diff line change
Expand Up @@ -1939,7 +1939,7 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,

/* Return whether a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
static boolean_t
vdev_is_leaf(nvlist_t *nv)
vdev_is_leaf_nv(nvlist_t *nv)
{
uint_t children = 0;
nvlist_t **child;
Expand All @@ -1952,10 +1952,10 @@ vdev_is_leaf(nvlist_t *nv)

/* Return whether a vdev is a leaf vdev and a real device (disk or file) */
static boolean_t
vdev_is_real_leaf(nvlist_t *nv)
vdev_is_real_leaf_nv(nvlist_t *nv)
{
const char *type = NULL;
if (!vdev_is_leaf(nv))
if (!vdev_is_leaf_nv(nv))
return (B_FALSE);

(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);
Expand Down Expand Up @@ -1988,7 +1988,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,

/* The very first entry in the NV list is a special case */
if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
if (real_leaves_only && !vdev_is_real_leaf(nv))
if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0);

*((nvlist_t **)last_nv) = nv;
Expand All @@ -2011,7 +2011,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
* we want.
*/
if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
if (real_leaves_only && !vdev_is_real_leaf(nv))
if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0);

*((nvlist_t **)last_nv) = nv;
Expand Down
Loading
Loading