Skip to content

Commit

Permalink
Backup allocation class vdev data to the pool
Browse files Browse the repository at this point in the history
This commit allows you to automatically backup allocation class vdevs to
the pool. If the alloc class vdev is fully backed up, it can fail
without the pool losing any data.  This also means you can safely create
pools with non-matching alloc class redundancy (like a mirrored pool
with a single special device).

It works by making sure all alloc class writes have at least two DVA
copies, and then having the 2nd copy always go to the pool itself. So
whenever you write to an alloc class vdev, another copy of the data is
also written to the pool.

This behavior is controlled via three properties:

1. feature@allow_backup_to_pool - This feature flag enables the backup
   subsystem.  It also prevents the backed-up pool from being imported
   read/write on an older version of ZFS that does not support alloc
   class backups.

2. backup_alloc_class_to_pool - This pool property is the main on/off
   switch to control the backup feature.  It is on by default but can be
   turned off at any time.  Once it is turned off, then all existing
   vdevs will no longer considered to be fully backed up.

3. backup_to_pool - This is a read-only vdev property that will report
   "on" if all the data on the vdev is fully backed up to the pool.

Note that the backup to pool feature is now enabled by default on all
new pools.  This may create a performance penalty over pure alloc class
writes due to the extra backup copy write to the pool.  Alloc class
reads should not be affected as they always read from DVA 0 first (the
copy of the data on the special device).

Closes: #15118

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
  • Loading branch information
tonyhutter committed Apr 11, 2024
1 parent 162cc80 commit 76e5591
Show file tree
Hide file tree
Showing 56 changed files with 2,573 additions and 304 deletions.
56 changes: 51 additions & 5 deletions cmd/zpool/zpool_vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,13 +480,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b)
return (B_FALSE);
}

/*
* Return true if 'props' contains either:
*
* feature@allow_backup_to_pool=disabled
*
* or
*
* backup_alloc_class_to_pool=off
*/
static boolean_t
is_backup_to_pool_disabled_in_props(nvlist_t *props)
{
const char *str = NULL;
if (nvlist_lookup_string(props, "feature@allow_backup_to_pool",
&str) == 0) {
if ((str != NULL) && strcmp(str, "disabled") == 0) {
return (B_TRUE); /* It is disabled */
}
}

if (nvlist_lookup_string(props, "backup_alloc_class_to_pool",
&str) == 0) {
if ((str != NULL) && strcmp(str, "off") == 0) {
return (B_TRUE); /* It is disabled */
}
}

return (B_FALSE);
}

/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev.
*/
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal)
{
nvlist_t **top;
uint_t t, toplevels;
Expand All @@ -507,6 +537,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)

for (t = 0; t < toplevels; t++) {
uint64_t is_log = B_FALSE;
const char *str = NULL;

nv = top[t];

Expand All @@ -518,6 +549,21 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
if (is_log)
continue;

/*
* By default, all alloc class devices have their backup to pool
* props enabled, so their replication level doesn't matter.
* However, if they're disabled for any reason, then we do need
* to force redundancy.
*/
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&str);
if (str &&
((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) ||
(strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) {
if (!is_backup_to_pool_disabled_in_props(props))
continue; /* We're backed up, skip redundancy */
}

/*
* Ignore holes introduced by removing aux devices, along
* with indirect vdevs introduced by previously removed
Expand Down Expand Up @@ -808,7 +854,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* report any difference between the two.
*/
static int
check_replication(nvlist_t *config, nvlist_t *newroot)
check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot)
{
nvlist_t **child;
uint_t children;
Expand All @@ -825,7 +871,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)

verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if ((current = get_replication(nvroot, B_FALSE)) == NULL)
if ((current = get_replication(props, nvroot, B_FALSE)) == NULL)
return (0);
}
/*
Expand All @@ -850,7 +896,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
* Get the replication level of the new vdev spec, reporting any
* inconsistencies found.
*/
if ((new = get_replication(newroot, B_TRUE)) == NULL) {
if ((new = get_replication(props, newroot, B_TRUE)) == NULL) {
free(current);
return (-1);
}
Expand Down Expand Up @@ -1888,7 +1934,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
* found. We include the existing pool spec, if any, as we need to
* catch changes against the existing replication level.
*/
if (check_rep && check_replication(poolconfig, newroot) != 0) {
if (check_rep && check_replication(props, poolconfig, newroot) != 0) {
nvlist_free(newroot);
return (NULL);
}
Expand Down
4 changes: 4 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO,
ZPOOL_PROP_BACKUP_ALLOC_CLASS_TO_POOL,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down Expand Up @@ -368,6 +369,7 @@ typedef enum {
VDEV_PROP_RAIDZ_EXPANDING,
VDEV_PROP_SLOW_IO_N,
VDEV_PROP_SLOW_IO_T,
VDEV_PROP_BACKUP_TO_POOL,
VDEV_NUM_PROPS
} vdev_prop_t;

Expand Down Expand Up @@ -845,6 +847,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */
#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"
#define ZPOOL_CONFIG_COMPATIBILITY "compatibility"
#define ZPOOL_CONFIG_BACKUP_TO_POOL "backup_to_pool"

/*
* The persistent vdev state is stored as separate values rather than a single
Expand Down Expand Up @@ -1604,6 +1607,7 @@ typedef enum {
ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_BACKUP_DISABLED_BUT_REQUESTED,
} zfs_errno_t;

/*
Expand Down
3 changes: 2 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing,
uint64_t missing_special);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
Expand Down
9 changes: 9 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,12 @@ struct spa {
uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */

/*
* number of 'spa_missing_tvds' that are alloc class devices
* backed up to the pool, and thus recoverable from errors.
*/
uint64_t spa_missing_recovered_tvds;

uint64_t spa_nonallocating_dspace;
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;
Expand Down Expand Up @@ -465,6 +471,9 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */

/* Backup special/dedup devices data to the pool */
boolean_t spa_backup_alloc_class;
};

extern char *spa_config_path;
Expand Down
18 changes: 18 additions & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,24 @@ extern uint32_t vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);

typedef enum {
/* (special flag) dry-run, get count only */
VDEV_ARRAY_COUNT = 1ULL << 0,

VDEV_ARRAY_ANY_LEAF = 1ULL << 1, /* match any leaf */
VDEV_ARRAY_SPECIAL_LEAF = 1ULL << 2, /* match special vdev leaves */
VDEV_ARRAY_DEDUP_LEAF = 1ULL << 3, /* match dedup vdev leaves */
} vdev_array_flag_t;

struct vdev_array
{
vdev_t **vds; /* Array of vdev_t's */
int count;
};

extern struct vdev_array *vdev_array_alloc(vdev_t *rvd, uint64_t flags);
extern void vdev_array_free(struct vdev_array *vda);

extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
Expand Down
12 changes: 12 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,13 @@ struct vdev {
uint64_t vdev_failfast; /* device failfast setting */
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
boolean_t vdev_ishole; /* is a hole in the namespace */

/*
* If this is set to true, then all the data on this vdev is backed up
* to the pool. This is only used by allocation class devices.
*/
boolean_t vdev_backup_to_pool;

uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */

Expand Down Expand Up @@ -641,6 +648,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd);
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
extern boolean_t vdev_is_fully_backed_up(vdev_t *vd);
extern boolean_t vdev_is_leaf(vdev_t *vd);
extern boolean_t vdev_is_special(vdev_t *vd);
extern boolean_t vdev_is_dedup(vdev_t *vd);
extern boolean_t vdev_is_alloc_class(vdev_t *vd);

/*
* Vdev ashift optimization tunables
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_ALLOW_BACKUP_TO_POOL,
SPA_FEATURES
} spa_feature_t;

Expand Down
10 changes: 5 additions & 5 deletions lib/libnvpair/libnvpair.abi
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,11 @@
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strchr' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='strcspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
Expand Down Expand Up @@ -2536,11 +2541,6 @@
<parameter type-id='b59d7dce'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strchr' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='95e97e5e'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='strlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
Expand Down
47 changes: 41 additions & 6 deletions lib/libuutil/libuutil.abi
Original file line number Diff line number Diff line change
Expand Up @@ -596,14 +596,11 @@
<var-decl name='prev' type-id='b03eadb4' visibility='default'/>
</data-member>
</class-decl>
<class-decl name='list' size-in-bits='256' is-struct='yes' visibility='default' id='e824dae9'>
<class-decl name='list' size-in-bits='192' is-struct='yes' visibility='default' id='e824dae9'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='list_size' type-id='b59d7dce' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='64'>
<var-decl name='list_offset' type-id='b59d7dce' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='128'>
<data-member access='public' layout-offset-in-bits='64'>
<var-decl name='list_head' type-id='b0b5e45e' visibility='default'/>
</data-member>
</class-decl>
Expand Down Expand Up @@ -800,9 +797,16 @@
<type-decl name='unsigned long int' size-in-bits='64' id='7359adad'/>
</abi-instr>
<abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='38b51b3c' size-in-bits='832' id='02b72c00'>
<subrange length='13' type-id='7359adad' id='487fded1'/>
</array-type-def>
<array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'>
<subrange length='3' type-id='7359adad' id='56f209d2'/>
</array-type-def>
<class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
<array-type-def dimensions='1' type-id='80f4b756' size-in-bits='832' id='39e6f84a'>
<subrange length='13' type-id='7359adad' id='487fded1'/>
</array-type-def>
<class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='mnt_special' type-id='26a90f95' visibility='default'/>
Expand Down Expand Up @@ -912,6 +916,25 @@
<typedef-decl name='__blksize_t' type-id='bd54fe1a' id='d3f10a7f'/>
<typedef-decl name='__blkcnt64_t' type-id='bd54fe1a' id='4e711bf1'/>
<typedef-decl name='__syscall_slong_t' type-id='bd54fe1a' id='03085adc'/>
<class-decl name='__locale_struct' size-in-bits='1856' is-struct='yes' visibility='default' id='90cc1ce3'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='__locales' type-id='02b72c00' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='832'>
<var-decl name='__ctype_b' type-id='31347b7a' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='896'>
<var-decl name='__ctype_tolower' type-id='6d60f45d' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='960'>
<var-decl name='__ctype_toupper' type-id='6d60f45d' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='1024'>
<var-decl name='__names' type-id='39e6f84a' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='__locale_t' type-id='f01e1813' id='b7ac9b5f'/>
<typedef-decl name='locale_t' type-id='b7ac9b5f' id='973a4f8d'/>
<class-decl name='timespec' size-in-bits='128' is-struct='yes' visibility='default' id='a9c79a1f'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='tv_sec' type-id='65eda9c0' visibility='default'/>
Expand All @@ -920,12 +943,23 @@
<var-decl name='tv_nsec' type-id='03085adc' visibility='default'/>
</data-member>
</class-decl>
<pointer-type-def type-id='23de8b96' size-in-bits='64' id='38b51b3c'/>
<pointer-type-def type-id='90cc1ce3' size-in-bits='64' id='f01e1813'/>
<qualified-type-def type-id='95e97e5e' const='yes' id='2448a865'/>
<pointer-type-def type-id='2448a865' size-in-bits='64' id='6d60f45d'/>
<qualified-type-def type-id='8efea9e5' const='yes' id='3beb2af4'/>
<pointer-type-def type-id='3beb2af4' size-in-bits='64' id='31347b7a'/>
<pointer-type-def type-id='0c544dc0' size-in-bits='64' id='394fc496'/>
<pointer-type-def type-id='56fe4a37' size-in-bits='64' id='b6b61d2f'/>
<qualified-type-def type-id='b6b61d2f' restrict='yes' id='3cad23cd'/>
<pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
<pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/>
<qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/>
<class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
<function-decl name='uselocale' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='973a4f8d'/>
<return type-id='973a4f8d'/>
</function-decl>
<function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='e75a27e9'/>
<parameter type-id='3cad23cd'/>
Expand All @@ -937,8 +971,9 @@
<parameter type-id='822cd80b'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strerror' visibility='default' binding='global' size-in-bits='64'>
<function-decl name='strerror_l' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='95e97e5e'/>
<parameter type-id='973a4f8d'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='__fprintf_chk' visibility='default' binding='global' size-in-bits='64'>
Expand Down
Loading

0 comments on commit 76e5591

Please sign in to comment.