diff --git a/cmd/dbufstat/dbufstat.in b/cmd/dbufstat/dbufstat.in index 82250353f5eb..4b61a04798ae 100755 --- a/cmd/dbufstat/dbufstat.in +++ b/cmd/dbufstat/dbufstat.in @@ -37,7 +37,7 @@ import re bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", - "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] @@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"] dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", "indirect", "bonus", "spill"] -dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", - "dbc", "list", "atype", "flags", "count", "asize", "access", - "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", - "l2_comp", "aholds"] +dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state", + "dbholds", "dbc", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds"] thdr = ["pool", "objset", "dtype", "cached"] txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", "bonus", "spill"] -tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", - "dbc", "dbholds", "list", "atype", "flags", "count", "asize", - "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", +tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta", + "state", "dbc", "dbholds", "list", "atype", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] @@ -70,6 +70,7 @@ cols = { "blkid": [8, -1, "block number of buffer"], "offset": [12, 1024, "offset in object of buffer"], "dbsize": [7, 1024, "size of buffer"], + "usize": [7, 1024, "size of attached user data"], "meta": [4, -1, "is this buffer metadata?"], "state": [5, -1, "state of buffer (read, cached, etc)"], "dbholds": [7, 1000, "number of holds on buffer"], @@ -399,6 +400,7 @@ def update_dict(d, k, line, labels): key = line[labels[k]] dbsize = int(line[labels['dbsize']]) + usize = int(line[labels['usize']]) blkid = int(line[labels['blkid']]) level = int(line[labels['level']]) @@ -416,7 +418,7 @@ def update_dict(d, k, line, labels): d[pool][objset][key]['indirect'] = 0 d[pool][objset][key]['spill'] = 0 - d[pool][objset][key]['cached'] += dbsize + d[pool][objset][key]['cached'] += dbsize + usize if blkid == -1: d[pool][objset][key]['bonus'] += dbsize diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index bf97b0d68713..cfec120e3fb9 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023-2024, Klara Inc. 
*/ /* @@ -276,6 +277,11 @@ usage(void) "\t\tcreate 3 lanes on the device; one lane with a latency\n" "\t\tof 10 ms and two lanes with a 25 ms latency.\n" "\n" + "\tzinject -P import|export -s <seconds> pool\n" + "\t\tAdd an artificial delay to a future pool import or export,\n" + "\t\tsuch that the operation takes a minimum of supplied seconds\n" + "\t\tto complete.\n" + "\n" "\tzinject -I [-s <seconds> | -g <txgs>] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" "\n" @@ -358,8 +364,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, { int *count = data; - if (record->zi_guid != 0 || record->zi_func[0] != '\0') + if (record->zi_guid != 0 || record->zi_func[0] != '\0' || + record->zi_duration != 0) { return (0); + } if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " @@ -462,6 +470,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record, return (0); } +static int +print_pool_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_cmd != ZINJECT_DELAY_IMPORT && + record->zi_cmd != ZINJECT_DELAY_EXPORT) { + return (0); + } + + if (*count == 0) { + (void) printf("%3s %-19s %-11s %s\n", + "ID", "POOL", "DELAY (sec)", "COMMAND"); + (void) printf("--- ------------------- -----------" + " -------\n"); + } + + *count += 1; + + (void) printf("%3d %-19s %-11llu %s\n", + id, pool, (u_longlong_t)record->zi_duration, + record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import" : "export"); + + return (0); +} + /* * Print all registered error handlers. Returns the number of handlers * registered. @@ -492,6 +527,13 @@ print_all_handlers(void) count = 0; } + (void) iter_handlers(print_pool_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_panic_handler, &count); return (count + total); @@ -564,9 +606,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record, zc.zc_guid = flags; if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to add handler: %s\n", - errno == EDOM ? 
"block level exceeds max level of object" : - strerror(errno)); + const char *errmsg = strerror(errno); + + switch (errno) { + case EDOM: + errmsg = "block level exceeds max level of object"; + break; + case EEXIST: + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "pool already imported"; + if (record->zi_cmd == ZINJECT_DELAY_EXPORT) + errmsg = "a handler already exists"; + break; + case ENOENT: + /* import delay injector running on older zfs module */ + if (record->zi_cmd == ZINJECT_DELAY_IMPORT) + errmsg = "import delay injector not supported"; + break; + default: + break; + } + (void) fprintf(stderr, "failed to add handler: %s\n", errmsg); return (1); } @@ -591,6 +651,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record, } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); + } else if (record->zi_timer > 0) { + (void) printf(" timer: %lld ms\n", + (u_longlong_t)NSEC2MSEC(record->zi_timer)); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); @@ -789,7 +852,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -919,6 +982,19 @@ main(int argc, char **argv) sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; + case 'P': + if (strcasecmp(optarg, "import") == 0) { + record.zi_cmd = ZINJECT_DELAY_IMPORT; + } else if (strcasecmp(optarg, "export") == 0) { + record.zi_cmd = ZINJECT_DELAY_EXPORT; + } else { + (void) fprintf(stderr, "invalid command '%s': " + "must be 'import' or 'export'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'q': quiet = 1; break; @@ -998,7 +1074,7 @@ main(int argc, char **argv) argc -= optind; argv += optind; - if (record.zi_duration != 0) + if (record.zi_duration != 0 && record.zi_cmd == 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { @@ -1128,8 +1204,8 @@ main(int argc, char **argv) if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL || record.zi_freq > 0 || dvas != 0) { - (void) fprintf(stderr, "panic (-p) incompatible with " - "other options\n"); + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); usage(); libzfs_fini(g_zfs); return (2); @@ -1147,6 +1223,28 @@ main(int argc, char **argv) if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; + } else if (record.zi_cmd == ZINJECT_DELAY_IMPORT || + record.zi_cmd == ZINJECT_DELAY_EXPORT) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { + (void) fprintf(stderr, "%s incompatible with other " + "options\n", "import|export delay (-P)"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1 || record.zi_duration <= 0) { + (void) fprintf(stderr, "import|export delay (-P) " + "injection requires a duration (-s) and a single " + "pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_freq > 0 || dvas != 0) { diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 91aac5566cc5..27598eba9477 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -50,6 +50,7 @@ #include 
#include #include +#include #include #include #include @@ -1848,10 +1849,19 @@ zpool_do_destroy(int argc, char **argv) } typedef struct export_cbdata { + tpool_t *tpool; + pthread_mutex_t mnttab_lock; boolean_t force; boolean_t hardforce; + int retval; } export_cbdata_t; + +typedef struct { + char *aea_poolname; + export_cbdata_t *aea_cbdata; +} async_export_args_t; + /* * Export one pool */ @@ -1860,11 +1870,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force, cb->hardforce) != 0) - return (1); + /* + * zpool_disable_datasets() is not thread-safe for mnttab access. + * So we serialize access here for 'zpool export -a' parallel case. + */ + if (cb->tpool != NULL) + pthread_mutex_lock(&cb->mnttab_lock); - /* The history must be logged as part of the export */ - log_history = B_FALSE; + int retval = zpool_disable_datasets(zhp, cb->force, cb->hardforce); + + if (cb->tpool != NULL) + pthread_mutex_unlock(&cb->mnttab_lock); + + if (retval) + return (1); if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) @@ -1876,6 +1895,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data) return (0); } +/* + * Asynchronous export request + */ +static void +zpool_export_task(void *arg) +{ + async_export_args_t *aea = arg; + + zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname); + if (zhp != NULL) { + int ret = zpool_export_one(zhp, aea->aea_cbdata); + if (ret != 0) + aea->aea_cbdata->retval = ret; + zpool_close(zhp); + } else { + aea->aea_cbdata->retval = 1; + } + + free(aea->aea_poolname); + free(aea); +} + +/* + * Process an export request in parallel + */ +static int +zpool_export_one_async(zpool_handle_t *zhp, void *data) +{ + tpool_t *tpool = ((export_cbdata_t *)data)->tpool; + async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t)); + + /* save pool name since zhp will go out of scope */ + aea->aea_poolname = strdup(zpool_get_name(zhp)); + aea->aea_cbdata = data; + + /* ship off actual export to another thread */ + if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0) + return (errno); /* unlikely */ + else + return (0); +} + /* * zpool export [-f] ... * @@ -1919,17 +1980,33 @@ zpool_do_export(int argc, char **argv) cb.force = force; cb.hardforce = hardforce; + cb.tpool = NULL; + cb.retval = 0; argc -= optind; argv += optind; + /* The history will be logged as part of the export itself */ + log_history = B_FALSE; + if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, - B_FALSE, zpool_export_one, &cb)); + cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + pthread_mutex_init(&cb.mnttab_lock, NULL); + + /* Asynchronously call zpool_export_one using thread pool */ + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + zpool_export_one_async, &cb); + + tpool_wait(cb.tpool); + tpool_destroy(cb.tpool); + (void) pthread_mutex_destroy(&cb.mnttab_lock); + + return (ret | cb.retval); } /* check arguments */ @@ -3068,12 +3145,21 @@ zfs_force_import_required(nvlist_t *config) nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + + /* + * The hostid on LOAD_INFO comes from the MOS label via + * spa_tryimport(). 
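(As an aside on the parallel "zpool export -a" path above: it follows the standard libtpool lifecycle of create, dispatch, wait, destroy, with a pthread mutex serializing the non-thread-safe mnttab section. A minimal standalone sketch of that pattern, assuming only the public thread_pool.h and pthread APIs; the worker and its argument here are illustrative, not part of this change:

	#include <thread_pool.h>
	#include <pthread.h>
	#include <stdlib.h>
	#include <unistd.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static void
	worker(void *arg)
	{
		/* Serialize any section that is not thread-safe. */
		pthread_mutex_lock(&lock);
		/* ... e.g. mnttab manipulation ... */
		pthread_mutex_unlock(&lock);
		free(arg);
	}

	int
	main(void)
	{
		tpool_t *tp = tpool_create(1,
		    5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
		for (int i = 0; i < 8; i++) {
			int *item = malloc(sizeof (int));
			*item = i;
			(void) tpool_dispatch(tp, worker, item);
		}
		tpool_wait(tp);		/* block until all tasks complete */
		tpool_destroy(tp);
		return (0);
	}

Workers run as the pool's threads become free, so export/import of independent pools can proceed concurrently while shared state stays guarded.)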
If it's not there then we're likely talking to an + * older kernel, so use the top one, which will be from the label + * discovered in zpool_find_import(), or if a cachefile is in use, the + * local hostid. + */ + if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) + nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); @@ -3143,7 +3229,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, uint64_t timestamp = 0; uint64_t hostid = 0; - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_HOSTNAME); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); @@ -3151,7 +3240,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_HOSTID); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); @@ -3196,15 +3288,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, return (ret); } +typedef struct import_parameters { + nvlist_t *ip_config; + const char *ip_mntopts; + nvlist_t *ip_props; + int ip_flags; + int *ip_err; +} import_parameters_t; + +static void +do_import_task(void *arg) +{ + import_parameters_t *ip = arg; + *ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts, + ip->ip_props, ip->ip_flags); + free(ip); +} + static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, - char *orig_name, char *new_name, - boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, - importargs_t *import) + char *orig_name, char *new_name, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; + boolean_t pool_specified = (import->poolname != NULL || + import->guid != 0); + + tpool_t *tp = NULL; + if (import->do_all) { + tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + } /* * At this point we have a list of import candidate configs. 
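(Note that zfs_force_import_required() and do_import() above now share the same two-step lookup: prefer the value the kernel surfaces under ZPOOL_CONFIG_LOAD_INFO, then fall back to the top-level config for older kernels. Condensed into one hypothetical helper, for illustration only; this function is not part of the change:

	#include <libnvpair.h>
	#include <sys/fs/zfs.h>

	/*
	 * Fetch the pool's last-known hostid, preferring the MOS-derived
	 * value in the LOAD_INFO nvlist and falling back to the top-level
	 * config when talking to an older kernel. Returns 0 if neither
	 * nvlist carries it.
	 */
	static uint64_t
	config_hostid(nvlist_t *config, nvlist_t *nvinfo)
	{
		uint64_t hostid = 0;

		if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID,
		    &hostid) != 0)
			(void) nvlist_lookup_uint64(config,
			    ZPOOL_CONFIG_HOSTID, &hostid);

		return (hostid);
	}

The same shape applies to ZPOOL_CONFIG_HOSTNAME in do_import().)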
Even if @@ -3221,9 +3338,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + if (!import->do_destroyed && + pool_state == POOL_STATE_DESTROYED) continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + if (import->do_destroyed && + pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, @@ -3232,12 +3351,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, if (!pool_specified) { if (first) first = B_FALSE; - else if (!do_all) + else if (!import->do_all) (void) printf("\n"); - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); + if (import->do_all) { + import_parameters_t *ip = safe_malloc( + sizeof (import_parameters_t)); + + ip->ip_config = config; + ip->ip_mntopts = mntopts; + ip->ip_props = props; + ip->ip_flags = flags; + ip->ip_err = &err; + + (void) tpool_dispatch(tp, do_import_task, + (void *)ip); } else { /* * If we're importing from cachefile, then @@ -3285,6 +3413,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, found_config = config; } } + if (import->do_all) { + tpool_wait(tp); + tpool_destroy(tp); + } /* * If we were searching for a specific pool, verify that we found a @@ -3514,7 +3646,6 @@ zpool_do_import(int argc, char **argv) boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; - boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; @@ -3722,7 +3853,6 @@ zpool_do_import(int argc, char **argv) searchname = argv[0]; searchguid = 0; } - pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. @@ -3763,6 +3893,8 @@ zpool_do_import(int argc, char **argv) idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; + idata.do_destroyed = do_destroyed; + idata.do_all = do_all; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); @@ -3802,9 +3934,7 @@ zpool_do_import(int argc, char **argv) } err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata); /* * If we're using the cachefile and we failed to import, then @@ -3825,9 +3955,8 @@ zpool_do_import(int argc, char **argv) pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); err = import_pools(pools, props, mntopts, flags, - argc >= 1 ? argv[0] : NULL, - argc >= 2 ? argv[1] : NULL, - do_destroyed, pool_specified, do_all, &idata); + argc >= 1 ? argv[0] : NULL, argc >= 2 ? 
argv[1] : NULL, + &idata); } error: @@ -8411,7 +8540,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " - "pool.\n")); + "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 000000000000..d5ebd926986a --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include <linux/mm.h> + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 41492c19d8a5..9c27ffcf3c7c 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -144,6 +144,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -261,6 +262,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC + ZFS_AC_KERNEL_MM_PAGE_SIZE ]) dnl # diff --git a/include/libzutil.h b/include/libzutil.h index 6b9facdf9cbe..fca0646d7f9c 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018, 2024 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H @@ -68,6 +68,8 @@ typedef struct importargs { boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ + boolean_t do_destroyed; + boolean_t do_all; } importargs_t; extern nvlist_t *zpool_search_import(void *, importargs_t *, diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 5695abee7b85..d46f636b23dc 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -92,6 +92,12 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU" +#define spa_taskq_read_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" + +#define spa_taskq_write_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 3040549e043d..74243ba903e7 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -91,6 +91,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index bcfc177a4751..a23b11d60a7b 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -36,7 +36,11 @@ struct xucred; typedef struct flock flock64_t; typedef struct vnode vnode_t; typedef struct vattr vattr_t; +#if __FreeBSD_version < 1400093 typedef enum vtype vtype_t; +#else +#define vtype_t __enum_uint8(vtype) +#endif #include #include diff --git a/include/os/linux/kernel/linux/Makefile.am b/include/os/linux/kernel/linux/Makefile.am index 6ff0df506d9c..fc2bde013f22 100644 --- a/include/os/linux/kernel/linux/Makefile.am +++ b/include/os/linux/kernel/linux/Makefile.am @@ -10,6 +10,7 @@ KERNEL_H = \ simd_x86.h \ simd_aarch64.h \ simd_powerpc.h \ + mm_compat.h \ mod_compat.h \ page_compat.h \ compiler_compat.h diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h new file mode 100644 index 000000000000..40056c68d6dd --- /dev/null +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include <linux/mm.h> + +/* 5.4 introduced page_size(). 
Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif + +#endif /* _ZFS_MM_COMPAT_H */ diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index b50175a10873..cdcaf8cbff85 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -146,6 +146,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); diff --git a/include/os/linux/zfs/sys/Makefile.am b/include/os/linux/zfs/sys/Makefile.am index a075db476e40..71c1dbda39c9 100644 --- a/include/os/linux/zfs/sys/Makefile.am +++ b/include/os/linux/zfs/sys/Makefile.am @@ -11,6 +11,7 @@ KERNEL_H = \ trace_dnode.h \ trace_multilist.h \ trace_rrwlock.h \ + trace_spa_taskqs.h \ trace_txg.h \ trace_vdev.h \ trace_zil.h \ diff --git a/include/os/linux/zfs/sys/trace_spa_taskqs.h b/include/os/linux/zfs/sys/trace_spa_taskqs.h new file mode 100644 index 000000000000..77e2b8ea1fd1 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_spa_taskqs.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_spa_taskqs + +#if !defined(_TRACE_SPA_TASKQS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SPA_TASKQS_H + +#include +#include + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * spa_taskqs_t *stqs, ..., + * taskq_ent_t *ent, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_spa_taskqs_ent_class, + TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent), + TP_ARGS(stqs, ent), +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_SPA_TASKQS_ENT_EVENT(name) \ +DEFINE_EVENT(zfs_spa_taskqs_ent_class, name, \ + TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent), \ + TP_ARGS(stqs, ent)) +/* END CSTYLED */ +DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatch); +DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatched); + +#endif /* _TRACE_SPA_TASKQS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_spa_taskqs +#include + +#else + +DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatch); +DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatched); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_zfs.h b/include/os/linux/zfs/sys/trace_zfs.h index 0e19f8d186d0..0b0ac94e01f0 100644 --- a/include/os/linux/zfs/sys/trace_zfs.h +++ b/include/os/linux/zfs/sys/trace_zfs.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/include/sys/abd.h b/include/sys/abd.h index 6903e0c0e713..86868e5a08cf 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -119,6 +122,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); @@ -207,6 +214,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 113700cd72b1..189b743c076a 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -79,6 +98,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0f80764a26a6..d8b44c4f3604 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -639,6 +639,9 @@ typedef struct dmu_buf_user { */ taskq_ent_t dbu_tqent; + /* Size of user data, for inclusion in dbuf_cache accounting. */ + uint64_t dbu_size; + /* * This instance's eviction function pointers. * @@ -721,6 +724,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db, */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); +/* + * User data size accounting. This can be used to artificially inflate the size + * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough + * to satisfy memory reclaim requests. It's not used for anything else, and + * defaults to 0. + */ +uint64_t dmu_buf_user_size(dmu_buf_t *db); +void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); +void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); + /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 262f7d482802..5f8d5f21b6a1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -798,7 +798,7 @@ extern void spa_add_feature_stats(spa_t *spa, nvlist_t *config); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -854,6 +854,8 @@ extern int zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; +extern avl_tree_t spa_namespace_avl; +extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c @@ -1004,6 +1006,10 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); +extern void spa_import_progress_set_notes(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); +extern void spa_import_progress_set_notes_nolog(spa_t *spa, + const char *fmt, ...) 
__printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, @@ -1147,6 +1153,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index b2a718af0f88..d84e2e4e1bc3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -183,6 +183,8 @@ typedef enum spa_proc_state { } spa_proc_state_t; typedef struct spa_taskqs { + zio_taskq_type_t stqs_type; + zio_type_t stqs_zio_type; uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; @@ -229,6 +231,8 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_export_thread; /* valid during pool export */ + kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 91699e65131a..a14d5cf8e11e 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..762e41fa40ce 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -290,7 +290,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t 
vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 2c7ae3003f46..2ec9636da278 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -503,6 +503,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 1ca3f211b56d..a78f5cf7fc59 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. */ @@ -447,6 +447,8 @@ typedef enum zinject_type { ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, + ZINJECT_DELAY_IMPORT, + ZINJECT_DELAY_EXPORT, } zinject_type_t; typedef struct zfs_share { diff --git a/include/sys/zio.h b/include/sys/zio.h index 9adeb5f1e8e8..2d65a777f202 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome @@ -685,6 +685,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); +extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); +extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 4c0d920fc2c5..6e96ed6cb3a0 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -182,6 +182,8 @@ extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern size_t slack_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern int slack_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ed705a012730..0595c4407e8d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -29,6 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2021, 2023, Klara Inc. 
*/ #include @@ -265,6 +266,7 @@ zpool_get_state_str(zpool_handle_t *zhp) } else if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || + status == ZPOOL_STATUS_IO_FAILURE_CONTINUE || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 456080f7f247..28bead7930e3 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -156,8 +156,8 @@ taskq_init_ent(taskq_ent_t *t) t->tqent_flags = 0; } -void -taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, +static void +taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { ASSERT(func != NULL); @@ -170,7 +170,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, /* * Enqueue the task to the underlying queue. */ - mutex_enter(&tq->tq_lock); if (flags & TQ_FRONT) { t->tqent_next = tq->tq_task.tqent_next; @@ -184,7 +183,26 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_func = func; t->tqent_arg = arg; cv_signal(&tq->tq_dispatch_cv); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + mutex_enter(&tq->tq_lock); + taskq_dispatch_ent_impl(tq, func, arg, flags, t); + mutex_exit(&tq->tq_lock); +} + +boolean_t +taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + if (!mutex_tryenter(&tq->tq_lock)) + return (B_FALSE); + taskq_dispatch_ent_impl(tq, func, arg, flags, t); mutex_exit(&tq->tq_lock); + return (B_TRUE); } void diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index ea48b54587cf..1a3d23b816d8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2,6 +2,7 @@ .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. +.\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 10, 2023 +.Dd January 9, 2024 .Dt ZFS 4 .Os . @@ -1305,6 +1306,29 @@ as fuller devices will tend to be slower than empty devices. Also see .Sy zio_dva_throttle_enabled . . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +This parameter is ignored if +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . +. +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 +and earlier. +This "classic" method has known issues with highly fragmented IO requests and +is slower on many workloads, but it has been in use for many years and is known +to be very stable. +If you set this parameter, please also open a bug report why you did so, +including the workload involved and any error messages. +.Pp +This parameter and the classic submission method will be removed once we have +total confidence in the new method. 
+.Pp +This parameter only applies on Linux, and can only be set at module load time. + . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . @@ -2167,6 +2191,16 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp +Set the queue and thread configuration for the IO read queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. +.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp +Set the queue and thread configuration for the IO write queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index a29346929988..93083e1fbf9a 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -127,6 +127,15 @@ Force a vdev error. . .It Xo .Nm zinject .Fl P Ar import Ns | Ns Ar export .Fl s Ar seconds .Ar pool .Xc Add an artificial delay to a future pool import or export, such that the operation takes a minimum of the supplied seconds to complete. This injector is automatically cleared after the operation is finished. . .It Xo .Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index 0b256b28bd21..2a1222e8f1b1 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -49,9 +49,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. . 
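(The re-check described in this zpool-clear.8 paragraph is what the new spa_mmp_remote_host_activity() helper, declared in include/sys/spa.h above, enables. A hedged sketch of how a clear/resume path might consult it; the actual kernel call site and error choice may differ:

	#include <sys/zfs_context.h>
	#include <sys/spa.h>

	/*
	 * Sketch only: refuse to resume a suspended multihost pool while
	 * another host appears to be actively updating MMP uberblocks.
	 */
	static int
	clear_allowed(spa_t *spa)
	{
		if (spa_multihost(spa) && spa_mmp_remote_host_activity(spa))
			return (SET_ERROR(EREMOTEIO));
		return (0);
	}

The guard mirrors the import-time MMP activity check, so a clear cannot quietly resume I/O on a pool another host has taken over.)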
.Sh SEE ALSO .Xr zdb 8 , diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index 3fa7939bdb3c..ba9ab33227bb 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -411,6 +411,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, taskqueue_enqueue(tq->tq_queue, &task->tqent_task); } +boolean_t +taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, + taskq_ent_t *task) +{ + /* XXX: implement me -- robn, 2023-10-23 */ + taskq_dispatch_ent(tq, func, arg, flags, task); + return (B_TRUE); +} + void taskq_wait(taskq_t *tq) { diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index ddd6d68b361c..2a7fa273deaa 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index fb25a4154485..d02ad6a4450d 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -673,17 +673,13 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, } EXPORT_SYMBOL(taskq_dispatch_delay); -void -taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, +static void +taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { - unsigned long irqflags; ASSERT(tq); ASSERT(func); - spin_lock_irqsave_nested(&tq->tq_lock, irqflags, - tq->tq_lock_class); - /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) { t->tqent_id = TASKQID_INVALID; @@ -694,7 +690,7 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, /* Dynamic taskq may be able to spawn another thread */ if (!(tq->tq_flags & TASKQ_DYNAMIC) || taskq_thread_spawn(tq) == 0) - goto out2; + return; flags |= TQ_FRONT; } @@ -734,11 +730,45 @@ /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + taskq_dispatch_ent_impl(tq, func, arg, flags, t); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); +boolean_t +taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + + /* + * XXX I don't _think_ losing _nested matters, because I think it's + * only related to lockdep, and we don't have access to that anyway + * -- robn, 2023-10-23 + */ + if (!spin_trylock_irqsave(&tq->tq_lock, irqflags)) + return (B_FALSE); + + taskq_dispatch_ent_impl(tq, func, arg, flags, t); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + + return (B_TRUE); +} +EXPORT_SYMBOL(taskq_try_dispatch_ent); + int taskq_empty_ent(taskq_ent_t *t) { diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 254df32410f1..f82c4ce6b82f 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. 
All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,7 +60,9 @@ #include #ifdef _KERNEL #include +#include #include +#include #else #define MAX_ORDER 1 #endif @@ -884,14 +887,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -904,6 +902,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -915,8 +914,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -998,6 +1004,106 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. + */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. 
If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + * + * Before kernel 4.5, compound page heads were refcounted + * separately, such that moving back to the head page would + * require us to take a reference to it and releasing it once + * we're completely finished with it. In practice, that means + * when our caller is done with the ABD, which we have no + * insight into from here. Rather than contort this API to + * track head page references on such ancient kernels, we just + * compile this block out and use the tail pages directly. This + * is slightly less efficient, but makes everything far + * simpler. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } +#endif + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd @@ -1220,4 +1326,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/trace.c b/module/os/linux/zfs/trace.c index a690822ae14c..9c09baee0378 100644 --- a/module/os/linux/zfs/trace.c +++ b/module/os/linux/zfs/trace.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 935cdc039d5e..743b6f23f9ac 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -49,11 +50,11 @@ typedef struct vdev_disk { int zio_suppress_zero_writes = B_TRUE; /* - * Maximum number of segments to add to a bio. If this is higher than the - * maximum allowed by the device queue or the kernel itself, it will be + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be * clamped. Setting it to zero will cause the kernel's ideal size to be used. */ -unsigned long vdev_disk_max_segs = 0; +uint_t zfs_vdev_disk_max_segs = 0; /* * Unique identifier for the exclusive vdev holder. @@ -72,20 +73,22 @@ static unsigned zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. 
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[0]; /* Attached bio's */ -} dio_request_t; - +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) @@ -93,6 +96,7 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } @@ -355,98 +359,15 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdev != NULL) { + if (vd->vd_bdev != NULL) blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static int -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } - - return (rc); -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - int rc; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - rc = vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -598,13 +519,17 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } -static inline unsigned int -vdev_bio_max_segs(struct block_device *bdev) { - const unsigned long tune_max_segs = - vdev_disk_max_segs > 0 ? vdev_disk_max_segs : ULONG_MAX; - const unsigned long dev_max_segs = - queue_max_segments(bdev_get_queue(bdev)); - const unsigned long max_segs = MIN(tune_max_segs, dev_max_segs); +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? 
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(max_segs)); @@ -613,10 +538,461 @@ vdev_bio_max_segs(struct block_device *bdev) { #endif } +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. + */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + int vbio_flags; /* bio flags */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + vbio->vbio_abd = NULL; + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = bdev_logical_block_size(bdev)-1; + vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; + + return (vbio); +} + +BIO_END_IO_PROTO(vbio_completion, bio, error); + static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { + struct bio *bio = vbio->vbio_bio; + uint_t ssize; + + while (size > 0) { + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + VERIFY(bio); + + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); + + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + ~(vbio->vbio_lbs_mask)); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. 
*/ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + bio = NULL; + } + + return (0); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +/* Create some BIOs, fill them with data and submit them */ +static void +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) +{ + /* + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. + */ + struct blk_plug plug; + blk_start_plug(&plug); + + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); + + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; + + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ + vdev_submit_bio(vbio->vbio_bio); + + blk_finish_plug(&plug); +} + +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + zio_t *zio = vbio->vbio_zio; + + ASSERT(zio); + + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); + + if (zio->io_error) + vdev_disk_error(zio); + + /* Return the BIO to the kernel */ + bio_put(bio); + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ABD, with changes if appropriate. + */ + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } + + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. + */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort.
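The "& ~(vbio_lbs_mask)" step in vbio_add_page() is what keeps each BIO filled to a whole number of logical blocks, so a later split by the kernel can't land mid-block. Here is the same arithmetic in isolation, as a standalone sketch assuming a 4K logical block size, a hypothetical 1 MiB per-BIO byte budget, and made-up fill values:

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
            const uint32_t lbs_mask = 4096 - 1;     /* 4K logical blocks */
            const uint32_t max_bytes = 1 << 20;     /* byte budget per BIO */
            uint32_t bio_size = 1043456;    /* bytes already in the BIO */
            uint32_t size = 16384;          /* data left in this page */

            /*
             * Take as much of the page as fits in the BIO, rounded down
             * to a whole number of logical blocks.
             */
            uint32_t ssize = MIN(size, (max_bytes - bio_size) & ~lbs_mask);

            /* 1048576 - 1043456 = 5120, which rounds down to 4096 */
            printf("add %u of %u bytes; %u bytes roll to the next BIO\n",
                ssize, size, size - ssize);
            return (0);
    }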
+ */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = (off+len) & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * B_TRUE if they can be submitted directly, B_FALSE if the ABD needs to be + * copied to a properly-aligned linear buffer first. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = vd->vd_bdev; + int flags = 0; + + /* + * Accessing outside the block device is never allowed. + */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bio_set_flags_failfast(bdev, &flags); + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). + */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something is seriously wrong and it's not safe to continue. + */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + vbio_t *vbio = vbio_alloc(zio, bdev, flags); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); + return (0); +} + +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time.
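To make the "fairly rare on 4K" case concrete, here is a sketch of the boundary rules above applied to a hypothetical ABD carrying a 512-byte block (a gang header, say) ahead of a 4K block, on a device with 4K logical blocks; struct page is dropped since only the offsets matter. The first segment ends mid-block, so the next iteration aborts and the ABD would be copied to a linear buffer:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    typedef struct {
            uint32_t bmask;
            uint32_t npages;
            uint32_t end;
    } check_pages_t;

    /* Same rules as vdev_disk_check_pages_cb(), minus struct page. */
    static int
    check_segment(check_pages_t *s, size_t off, size_t len)
    {
            if (s->end != 0)
                    return (1);     /* previous segment ended mid-block */
            s->end = (off + len) & s->bmask;
            if (s->npages != 0 && (off & s->bmask) != 0)
                    return (1);     /* later segment starts mid-block */
            s->npages++;
            return (0);
    }

    int
    main(void)
    {
            check_pages_t s = { .bmask = 4096 - 1 };

            /* 512-byte block, then a 4K block, both page-aligned */
            printf("segment 0: %s\n",
                check_segment(&s, 0, 512) ? "abort" : "ok");
            printf("segment 1: %s\n",
                check_segment(&s, 0, 4096) ? "abort" : "ok");
            return (0);
    }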
+ * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. + */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + +static inline unsigned int +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +{ + unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, + bio_size, abd_offset); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(nr_segs)); +#else + return (MIN(nr_segs, BIO_MAX_PAGES)); +#endif +} + +static int +vdev_classic_physio(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = vd->vd_bdev; + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -637,7 +1013,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); @@ -669,23 +1045,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(bdev); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -707,7 +1083,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -721,11 +1097,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - (void) vdev_disk_dio_put(dr); + (void) vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -795,12 +1173,14 @@ vdev_disk_io_trim(zio_t *zio) #endif } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. @@ -879,13 +1259,6 @@ vdev_disk_io_start(zio_t *zio) rw_exit(&vd->vd_lock); zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); @@ -893,23 +1266,34 @@ vdev_disk_io_start(zio_t *zio) zio_interrupt(zio); return; - default: + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } return; - } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); - if (error) { - zio->io_error = error; + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } + + __builtin_unreachable(); } static void @@ -958,8 +1342,49 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. 
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + +/* + * At first vdev use, set the submission function from the default value if + * it hasn't been set already. + */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, @@ -1049,5 +1474,12 @@ param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) ZFS_MODULE_PARAM(zfs_zio, zio_, suppress_zero_writes, INT, ZMOD_RW, "Do not send zero byte writes to hardware"); -ZFS_MODULE_PARAM(zfs_vdev_disk, vdev_disk_, max_segs, ULONG, ZMOD_RW, - "Maximum number of data segments to add to an IO request"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, + "Timeout before determining that a device is missing"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 42bf3e3036f9..79de6378f813 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -816,6 +816,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1db68669a9f5..a690a7e6ebf1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8491,11 +8491,11 @@ l2arc_dev_get_next(void) break; } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; @@ -10145,7 +10145,8 @@
l2arc_spa_rebuild_start(spa_t *spa) void l2arc_spa_rebuild_stop(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); /* * Locate the spa's l2arc devices and kick off rebuild threads. diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ba9e0aa7b3f9..10cbfce9f5c0 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -554,6 +554,21 @@ dbuf_evict_user(dmu_buf_impl_t *db) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run). + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, db); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for @@ -693,12 +708,12 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -2808,6 +2823,8 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -3540,17 +3557,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } db->db_caching_status = DB_NO_CACHE; } @@ -3782,7 +3799,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; + uint64_t db_size = db->db.db_size + + dmu_buf_user_size(&db->db); size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; @@ -3885,6 +3903,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user ==
NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + +void +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + atomic_sub_64(&db->db_user->dbu_size, nsub); +} + void dmu_buf_user_evict_wait(void) { diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 037190a81bb3..6586947b1708 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -46,14 +46,14 @@ static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | " "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, (ulong_t)zfs_refcount_count(&db->db_holds), diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index c0a2b36857be..54b8475293e9 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1120,9 +1120,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { @@ -1134,8 +1136,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void @@ -1448,6 +1453,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, 
db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } @@ -1505,8 +1512,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { @@ -1514,6 +1526,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index f67a4eb22a2d..73de01a0bad3 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -662,12 +662,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", diff --git a/module/zfs/slack.c b/module/zfs/slack.c index 6c9a0418068b..cbf797dd9c74 100644 --- a/module/zfs/slack.c +++ b/module/zfs/slack.c @@ -54,3 +54,11 @@ slack_compress(void *src, void *dst, size_t s_len, size_t d_len, int level) memcpy(dst, src, c_len); return (c_len); } + +int +slack_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level) +{ + ASSERT3U(d_len, >=, s_len); + memcpy(dst, src, s_len); + return (0); +} diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9a176781c942..cb7d7908c668 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -33,6 +33,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2024, Klara Inc. */ /* @@ -87,6 +88,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -150,7 +152,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -172,6 +174,14 @@ uint_t zio_taskq_batch_tpq; /* threads per taskq */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ +/* + * If enabled, try to find an unlocked IO taskq to dispatch an IO onto before + * falling back to waiting on a lock. This should only be enabled in + * conjunction with careful performance testing, and will likely require + * zio_taskq_read/zio_taskq_write to be adjusted as well. 
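The dbuf and dnode changes above charge attached user data (here, dnode_t objects) to the dbuf caches through the new dbu_size counter: dnode_hold_impl() adds sizeof (dnode_t) as dnodes are created on a dnode block, and subtracts it as slots are reclaimed. A minimal userspace model of that bookkeeping, using C11 atomics in place of the kernel's atomic_*_64() wrappers and an illustrative stand-in for sizeof (dnode_t):

    #include <stdio.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <assert.h>

    #define DNODE_SIZE 808  /* illustrative; sizeof (dnode_t) varies */

    static _Atomic uint64_t dbu_size;       /* user-data byte count */

    /* Mirrors dmu_buf_add_user_size(): dnode attached to the buf */
    static void
    add_user_size(uint64_t nadd)
    {
            atomic_fetch_add(&dbu_size, nadd);
    }

    /* Mirrors dmu_buf_sub_user_size(): dnode slot reclaimed */
    static void
    sub_user_size(uint64_t nsub)
    {
            assert(atomic_load(&dbu_size) >= nsub);
            atomic_fetch_sub(&dbu_size, nsub);
    }

    int
    main(void)
    {
            /* Three dnodes created on one dnode block... */
            for (int i = 0; i < 3; i++)
                    add_user_size(DNODE_SIZE);

            /* ...then two slots reclaimed for a multi-slot hold */
            sub_user_size(2 * DNODE_SIZE);

            /* The remainder is what the dbuf cache now charges */
            printf("user size: %llu bytes\n",
                (unsigned long long)atomic_load(&dbu_size));
            return (0);
    }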
+ */ +boolean_t zio_taskq_trylock = B_FALSE; + boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* @@ -982,6 +992,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; + tqs->stqs_type = q; + tqs->stqs_zio_type = t; + switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); @@ -1114,29 +1127,313 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE): + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL). + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. + */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters?
*/ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ + int pos = 0; + + /* Build parameter string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + if (add_newline) + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack.
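A compressed userspace model of the grammar this parser accepts may help; it assumes POSIX strtok_r/strtoull in place of the kernel string routines, and only prints what a row would contain rather than building zio_taskq_info_t entries via ZTI_P(ntpq, nq):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main(void)
    {
            char cfg[] = "fixed,1,8 null scale null"; /* default READ row */
            char *save = NULL;

            /* One token per slot: ISSUE ISSUE_HIGH INTR INTR_HIGH */
            for (char *tok = strtok_r(cfg, " ", &save); tok != NULL;
                tok = strtok_r(NULL, " ", &save)) {
                    char *c = strchr(tok, ',');
                    if (c != NULL)
                            *c++ = '\0';

                    if (strcmp(tok, "fixed") == 0 && c != NULL) {
                            /* fixed,Q,T: Q taskqs of T threads each */
                            unsigned long long nq = strtoull(c, &c, 10);
                            unsigned long long ntpq =
                                strtoull(c + 1, NULL, 10);
                            printf("fixed: %llu taskq(s) of %llu "
                                "thread(s)\n", nq, ntpq);
                    } else {
                            /* batch/scale/null carry no parameters */
                            printf("%s\n", tok);
                    }
            }
            return (0);
    }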
+ */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. In that case we try each one until it goes in, before + * falling back to waiting on a lock. */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + DTRACE_PROBE2(spa_taskqs_ent__dispatch, + spa_taskqs_t *, tqs, taskq_ent_t *, ent); + if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + taskq_dispatch_ent(tqs->stqs_taskq[0], func, arg, flags, ent); + goto out; + } + + int select = ((uint64_t)gethrtime()) % tqs->stqs_count; + if (zio_taskq_trylock) { + for (int i = 0; i < tqs->stqs_count; i++) { + if (taskq_try_dispatch_ent( + tqs->stqs_taskq[select], func, arg, flags, ent)) + goto out; + select = (select+1) % tqs->stqs_count; + } } - taskq_dispatch_ent(tq, func, arg, flags, ent); + taskq_dispatch_ent(tqs->stqs_taskq[select], func, arg, flags, ent); + +out: + DTRACE_PROBE2(spa_taskqs_ent__dispatched, + spa_taskqs_t *, tqs, taskq_ent_t *, ent); } /* @@ -1619,7 +1916,8 @@ spa_unload(spa_t *spa, txg_wait_flag_t txg_how) vdev_t *vd; uint64_t t, txg; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -2931,8 +3229,6 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -3035,6 +3331,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -3244,18 +3541,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. 
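The spa_taskq_dispatch_ent() change above is a try-then-fallback pattern: when zio_taskq_trylock is enabled, a busy taskq is skipped rather than slept on, and only after a full pass over all queues does the caller block. A small pthreads model, with pthread_mutex_trylock standing in for taskq_try_dispatch_ent() and time(NULL) for gethrtime():

    #include <stdio.h>
    #include <pthread.h>
    #include <time.h>

    #define NTQS 4

    static pthread_mutex_t tq_lock[NTQS] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    /* Offer the work to each queue in turn; block only as a last resort. */
    static int
    dispatch(void)
    {
            int select = (int)(time(NULL) % NTQS); /* ~ gethrtime() % count */

            for (int i = 0; i < NTQS; i++) {
                    if (pthread_mutex_trylock(&tq_lock[select]) == 0) {
                            /* uncontended queue: enqueue and return */
                            pthread_mutex_unlock(&tq_lock[select]);
                            return (select);
                    }
                    select = (select + 1) % NTQS;
            }

            /* every queue was busy; fall back to a blocking acquire */
            pthread_mutex_lock(&tq_lock[select]);
            pthread_mutex_unlock(&tq_lock[select]);
            return (select);
    }

    int
    main(void)
    {
            printf("dispatched to taskq %d\n", dispatch());
            return (0);
    }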
+ * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3293,9 +3595,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } + + int iterations = 0; + while ((now = gethrtime()) < import_expire) { + if (importing && iterations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3377,6 +3693,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -3697,7 +4068,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -3904,6 +4276,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); + /* + * If 'zpool import' used a cached 
config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the @@ -4580,7 +4970,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -4827,6 +5218,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -4871,12 +5263,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) return (error); } + /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed @@ -4886,60 +5285,68 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. 
Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -4949,8 +5356,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -4958,15 +5366,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -4974,6 +5384,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -4990,6 +5401,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -5009,6 +5421,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * will have been set for us by ZIL traversal operations * performed above. */ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -5016,6 +5429,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -5024,6 +5438,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. 
*/ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -5037,6 +5452,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -5049,27 +5466,40 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). */ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -6337,9 +6767,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6476,9 +6911,10 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; boolean_t force_removal, modifying; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -6509,8 +6945,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, new_state == POOL_STATE_EXPORTED); /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. + * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. 
*/ spa_open_ref(spa, FTAG); @@ -6547,10 +6983,13 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } /* * The pool will be in core if it's openable, in which case we can @@ -6594,6 +7033,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finished. + */ + if (spa->spa_sync_on) { /* * A pool cannot be exported if it has an active shared spare. @@ -6604,7 +7051,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); - goto fail; + goto fail_unlocked; } /* @@ -6670,13 +7117,20 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, error = spa_unload(spa, hardforce ? TXG_WAIT_F_FORCE_EXPORT : TXG_WAIT_F_NOSUSPEND); if (error != 0) - goto fail; + goto fail_unlocked; spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + /* + * Take the namespace lock for the actual spa_t removal + */ + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!force_removal) spa_write_cachefile(spa, B_TRUE, B_TRUE); @@ -6688,16 +7142,29 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); +fail_unlocked: + mutex_enter(&spa_namespace_lock); fail: if (force_removal) spa_set_export_initiator(spa, NULL); spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } @@ -8311,15 +8778,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8408,11 +8876,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted.
*/ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -10199,6 +10667,9 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, "Number of threads per IO worker taskqueue"); +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_trylock, UINT, ZMOD_RD, + "Try to dispatch IO to an unlocked IO taskqueue before sleeping"); + ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); @@ -10218,4 +10689,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 9044d3c1668b..325a533bf098 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1155,6 +1155,7 @@ spa_ld_log_sm_data(spa_t *spa) uint_t pn = 0; uint64_t ps = 0; + uint64_t nsm = 0; psls = sls = avl_first(&spa->spa_sm_logs_by_txg); while (sls != NULL) { /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ @@ -1187,6 +1188,10 @@ spa_ld_log_sm_data(spa_t *spa) summary_add_data(spa, sls->sls_txg, sls->sls_mscount, 0, sls->sls_nblocks); + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg @@ -1202,6 +1207,7 @@ spa_ld_log_sm_data(spa_t *spa) pn--; ps -= space_map_length(sls->sls_sm); + nsm++; space_map_close(sls->sls_sm); sls->sls_sm = NULL; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); @@ -1212,11 +1218,11 @@ spa_ld_log_sm_data(spa_t *spa) hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: if (error != 0) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ae95e8b64c1e..3e9ccda6f0c7 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,13 +20,14 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. 
* Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -79,7 +80,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -232,9 +234,9 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; +kcondvar_t spa_namespace_cv; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; @@ -417,6 +419,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* @@ -604,6 +608,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -615,6 +620,20 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. + */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -712,6 +731,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -806,7 +826,6 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); @@ -901,7 +920,8 @@ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } @@ -921,13 +941,15 @@ spa_close_common(spa_t *spa, const void *tag) /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. + * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); spa_close_common(spa, tag); } @@ -947,13 +969,15 @@ spa_async_close(spa_t *spa, void *tag) /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. 
We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1201,6 +1225,8 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1218,6 +1244,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); if (guid != 0) { @@ -2215,6 +2243,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2225,9 +2254,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2236,11 +2265,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? 
sip->spa_load_notes : "-")); return (0); } @@ -2254,6 +2284,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2309,6 +2341,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2318,6 +2354,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2386,6 +2475,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2405,6 +2495,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); @@ -2801,8 +2893,7 @@ spa_state_to_name(spa_t *spa) vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; - if (spa_suspended(spa) && - (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { diff --git a/module/zfs/txg.c b/module/zfs/txg.c index dbca5913348d..12ce303a34d6 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -585,6 +585,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. 
+ */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index fc1b62a39ac3..b2434821ef4a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1584,6 +1584,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1627,6 +1628,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from the zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1678,6 +1690,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1704,15 +1717,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index d4e08f1b7724..bff572d7d90c 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
*/ #include @@ -636,7 +636,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -678,7 +679,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -710,7 +712,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); @@ -729,7 +732,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ea2f84951e4..58755cd1822b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1894,6 +1894,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ @@ -1901,7 +1902,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 59dcfc80c7a3..6a23ce6b2416 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -22,6 +22,7 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2024 by Delphix. All rights reserved. */ #include @@ -1067,7 +1068,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } @@ -1081,7 +1083,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index f07c747d1f69..b71af92893d8 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. 
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP */ @@ -1021,7 +1021,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1060,7 +1061,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1096,7 +1098,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); @@ -1129,7 +1132,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1523,8 +1527,8 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b78cb802199f..aa10e98fed24 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -241,6 +241,11 @@ unsigned long zfs_max_nvlist_src_size = 0; */ unsigned long zfs_history_output_max = 1024 * 1024; +/* + * Whether or not to allow compression=slack to be set on a dataset. + */ +int zfs_slack_compress_enabled = 0; + uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; @@ -4573,6 +4578,9 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) if (compval == ZIO_COMPRESS_SLACK) { spa_t *spa; + if (!zfs_slack_compress_enabled) + return (SET_ERROR(ENOTSUP)); + if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); @@ -5715,10 +5723,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. 
*/ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); @@ -7770,4 +7781,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); + +ZFS_MODULE_PARAM(zfs, zfs_, slack_compress_enabled, INT, ZMOD_RW, + "Allow slack compression feature to be set on a dataset"); /* END CSTYLED */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index edc5999b957c..b8477b2f9dfc 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3607,7 +3607,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) int zil_commit_impl(zilog_t *zilog, uint64_t foid) { - ASSERT0(zil_failed(zilog) || zilog->zl_suspend > 0); + ASSERT0(zil_failed(zilog)); ZIL_STAT_BUMP(zil_commit_count); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bee032457840..3a46675f0da0 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2535,8 +2535,11 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "is set to panic.", spa_name(spa)); if (!spa_suspended(spa)) { - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " - "I/O failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an " + "uncorrectable I/O failure and has been " + "suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 7c8188f7d7fe..cafcc8879e1a 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -68,7 +68,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level}, - {"slack", 0, slack_compress, NULL, NULL }, + {"slack", 0, slack_compress, slack_decompress, NULL }, }; uint8_t diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index feaf41dc65e3..ed502d3e2d52 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. 
*/ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -699,6 +701,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -756,6 +815,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? + handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -806,16 +887,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. 
+ */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -828,6 +935,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -887,7 +999,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strncpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -937,7 +1053,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 421933364fd1..4faa48ba1bad 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -376,7 +376,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_005_pos', - 'zpool_export_006_pos', 'zpool_export_007_pos'] + 'zpool_export_006_pos', 'zpool_export_007_pos', + 'zpool_export_parallel_pos', 'zpool_export_parallel_admin'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] @@ -401,6 +402,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', + 'zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', @@ -411,7 +416,9 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', - 'import_rewind_device_replaced'] + 'import_rewind_device_replaced', + 'zpool_import_status', 'zpool_import_parallel_pos', + 'zpool_import_parallel_neg', 'zpool_import_parallel_admin'] tags = ['functional', 'cli_root', 
'zpool_import'] timeout = 1200 diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 94c1cbbc3f9f..2d1b21a884ff 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -127,7 +127,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am index 23c864db19ac..21ecc8c057de 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am @@ -8,7 +8,9 @@ dist_pkgdata_SCRIPTS = \ zpool_export_004_pos.ksh \ zpool_export_005_pos.ksh \ zpool_export_006_pos.ksh \ - zpool_export_007_pos.ksh + zpool_export_007_pos.ksh \ + zpool_export_parallel_admin.ksh \ + zpool_export_parallel_pos.ksh dist_pkgdata_DATA = \ zpool_export.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh new file mode 100755 index 000000000000..cab8fc2b4239 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that admin commands cannot race a pool export +# +# STRATEGY: +# 1. Create a pool +# 2. Export the pool in the background with an injected delay +# 3. Execute some admin commands against the pool +# + +verify_runnable "global" + +DEVICE_DIR=$TEST_BASE_DIR/dev_export-test + +function cleanup +{ + zinject -c all + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "admin commands cannot race a pool export" + +log_onexit cleanup + +[[ !
-d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR +log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1 + +log_must zinject -P export -s 10 $TESTPOOL1 + +log_must zpool export $TESTPOOL1 & + +zpool set comment=hello $TESTPOOL1 +zpool reguid $TESTPOOL1 & +zpool split $TESTPOOL1 & + +log_pass "admin commands cannot race a pool export" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh new file mode 100755 index 000000000000..037d17d082bd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh @@ -0,0 +1,129 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +MAX_NUM=8 +DEVICE_DIR=$TEST_BASE_DIR/dev_import-test + + +# +# DESCRIPTION: +# Verify that pool exports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Inject an export delay using zinject +# 3. Export half of the pools synchronously to baseline sequential cost +# 4. Export the other half asynchronously to demonstrate parallel savings +# 5. Import 4 pools +# 6. Test zpool export -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# + +POOLNAME="test_pool" + +function cleanup +{ + zinject -c all + + for i in {0..$(($MAX_NUM - 1))}; do + poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i + done + + [[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR +} + +log_assert "Pool exports can occur in parallel" + +log_onexit cleanup + +[[ !
-d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR + +# +# Create some pools with export delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i + log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# Export half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool export $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially exported 4 pools in $sequential_time seconds" + +# +# Export half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool export $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asynchronously exported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# import 4 pools with export delay injectors +# +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i + log_must zinject -P export -s 8 $POOLNAME-$i +done + +# +# now test zpool export -a +# +SECONDS=0 +log_must zpool export -a +parallel_time=$SECONDS +log_note "asynchronously exported 4 pools, using '-a', in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool exports occur in parallel" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index a8c9a31dcfdc..c9486a57b1d4 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -36,10 +36,17 @@ dist_pkgdata_SCRIPTS = \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ zpool_import_features_003_pos.ksh \ + zpool_import_hostid_changed.ksh \ + zpool_import_hostid_changed_unclean_export.ksh \ + zpool_import_hostid_changed_cachefile.ksh \ + zpool_import_hostid_changed_cachefile_unclean_export.ksh \ zpool_import_missing_001_pos.ksh \ zpool_import_missing_002_pos.ksh \ zpool_import_missing_003_pos.ksh \ zpool_import_rename_001_pos.ksh \ + zpool_import_parallel_admin.ksh \ + zpool_import_parallel_neg.ksh \ + zpool_import_parallel_pos.ksh \ zpool_import_encrypted.ksh \ zpool_import_encrypted_load.ksh \ zpool_import_errata3.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 25f541ebf185..91fd7edf8529 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -26,6 +26,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib @@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test + +export HOSTID_FILE="/etc/hostid" +export HOSTID1=01234567 +export HOSTID2=89abcdef diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 8bbd668a9317..d534ee69ece8 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -11,6 +11,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # .
$STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh new file mode 100755 index 000000000000..bc82b7cc1ee8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable without force even if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Export the pool. +# 4. Change the hostid. +# 5. Verify that importing the pool without force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Export the pool. +log_must zpool export $TESTPOOL1 + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool without force succeeds. +log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh new file mode 100755 index 000000000000..07c43482d68f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool with a cachefile. +# 3. Backup the cachefile. +# 4. Export the pool. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool.
+log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachefile. +log_must cp $CPATH $CPATHBKP + +# 4. Export the pool. +log_must zpool export $TESTPOOL1 + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool from cachefile " \ + "when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh new file mode 100755 index 000000000000..dcb1ac1ab69f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable from a cachefile +# without force if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Backup the cachefile. +# 4. Simulate the pool being torn down without export: +# 4.1. Copy the underlying device state. +# 4.2. Export the pool. +# 4.3. Restore the device state from the copy. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile fails. +# 7. Verify that importing the pool from the cachefile with force +# succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachefile. +log_must cp $CPATH $CPATHBKP + +# 4. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile fails. +log_mustnot zpool import -c $CPATHBKP $TESTPOOL1 + +# 7. Verify that importing the pool from the cachefile with force succeeds. +log_must zpool import -f -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import from cachefile requires force if not cleanly " \ + "exported and hostid changes."
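
Both unclean-export tests (the cachefile variant above and the plain variant that follows) rely on the same trick to fake a crash: copy the backing file while the pool is still imported, export the pool, then restore the copy, so the device is left holding a label that still marks the pool active under the old hostid. A minimal ksh sketch of that pattern as a reusable helper (the function name is illustrative, not part of the patch):

# Sketch only: simulate a pool torn down without a clean export.
# Assumes a file-backed vdev; the helper name is hypothetical.
function simulate_unclean_export
{
	typeset pool=$1
	typeset vdev=$2

	cp $vdev $vdev.bak	# capture the active label and old hostid
	zpool export $pool	# a clean export would rewrite the label
	cp -f $vdev.bak $vdev	# roll the device back to the active label
	rm -f $vdev.bak
}

With the device rolled back, a later zpool import sees a pool that appears active on another host, so it fails without -f and succeeds with it, which is what the verification steps in these tests assert.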
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh new file mode 100755 index 000000000000..ad8cca642dbc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable without force if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Simulate the pool being torn down without export: +# 3.1. Copy the underlying device state. +# 3.2. Export the pool. +# 3.3. Restore the device state from the copy. +# 4. Change the hostid. +# 5. Verify that importing the pool fails. +# 6. Verify that importing the pool with force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool fails. +log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1 + +# 6. Verify that importing the pool with force succeeds. +log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1 + +log_pass "zpool import requires force if not cleanly exported " \ + "and hostid changed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh new file mode 100755 index 000000000000..c681d1b7dd23 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh @@ -0,0 +1,165 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that admin commands to a different pool are not blocked by import +# +# STRATEGY: +# 1. Create 2 pools +# 2. Export one of the pools +# 3. Import the pool with an injected delay +# 4. Execute some admin commands against both pools +# 5. Verify that the admin commands to the non-imported pool don't stall +# + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL1 + destroy_pool $TESTPOOL2 +} + +function pool_import +{ + typeset dir=$1 + typeset pool=$2 + + SECONDS=0 + errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: imported in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-import + else + echo ${pool}: import failed $errmsg in $SECONDS secs + fi +} + +function pool_add_device +{ + typeset pool=$1 + typeset device=$2 + typeset devtype=$3 + + SECONDS=0 + errmsg=$(zpool add $pool $devtype $device 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: added $devtype vdev in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-add + else + echo ${pool}: add $devtype vdev failed ${errmsg}, in $SECONDS secs + fi +} + +function pool_stats +{ + typeset stats=$1 + typeset pool=$2 + + SECONDS=0 + errmsg=$(zpool $stats $pool 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + echo ${pool}: $stats in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats} + else + echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs + fi +} + +function pool_create +{ + typeset pool=$1 + typeset device=$2 + + SECONDS=0 + errmsg=$(zpool create $pool $device 2>&1 > /dev/null) + if [[ $?
-eq 0 ]]; then + echo ${pool}: created in $SECONDS secs + echo $SECONDS > ${DEVICE_DIR}/${pool}-create + else + echo ${pool}: create failed ${errmsg}, in $SECONDS secs + fi +} + +log_assert "Simple admin commands to a different pool not blocked by import" + +log_onexit cleanup + +# +# create two pools and export one +# +log_must zpool create $TESTPOOL1 $VDEV0 +log_must zpool export $TESTPOOL1 +log_must zpool create $TESTPOOL2 $VDEV1 + +# +# import pool asynchronously with an injected 10 second delay +# +log_must zinject -P import -s 10 $TESTPOOL1 +pool_import $DEVICE_DIR $TESTPOOL1 & + +sleep 2 + +# +# run some admin commands on the pools while the import is in progress +# + +pool_add_device $TESTPOOL1 $VDEV2 "log" & +pool_add_device $TESTPOOL2 $VDEV3 "cache" & +pool_stats "status" $TESTPOOL1 & +pool_stats "status" $TESTPOOL2 & +pool_stats "list" $TESTPOOL1 & +pool_stats "list" $TESTPOOL2 & +pool_create $TESTPOOL1 $VDEV4 & +wait + +log_must zpool sync $TESTPOOL1 $TESTPOOL2 + +zpool history $TESTPOOL1 +zpool history $TESTPOOL2 + +log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import) + +# +# verify that commands to second pool did not wait for import to finish +# +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list) +log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add) +[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create" + +log_pass "Simple admin commands to a different pool not blocked by import" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh new file mode 100755 index 000000000000..339dc2575ede --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh @@ -0,0 +1,130 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Verify that pool imports by same name only have one winner +# +# STRATEGY: +# 1. Create 4 single disk pools with the same name +# 2. Generate some ZIL records (for a longer import) +# 3. Export the pools +# 4. Import the pools in parallel +# 5.
Repeat using matching guids +# + +verify_runnable "global" + +POOLNAME="import_pool" +DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME" +VDEVSIZE=$((512 * 1024 * 1024)) + +log_assert "parallel pool imports by same name only have one winner" + +# each pool has its own device directory +for i in {0..3}; do + log_must mkdir -p ${DEV_DIR_PREFIX}$i + log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i +done + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $POOLNAME + + log_must rm -rf $DEV_DIR_PREFIX* +} + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + +function import_pool +{ + typeset dir=$1 + typeset pool=$2 + typeset newname=$3 + + SECONDS=0 + errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null) + if [[ $? -eq 0 ]]; then + touch $dir/imported + echo "imported $pool in $SECONDS secs" + elif [[ $errmsg == *"cannot import"* ]]; then + echo "pool import failed: $errmsg, waited $SECONDS secs" + touch $dir/failed + fi +} + +# +# create four exported pools with the same name +# +for i in {0..3}; do + log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i + log_must zpool export $POOLNAME +done +log_must zinject -P import -s 10 $POOLNAME + +# +# import the pools in parallel, expecting only one winner +# +for i in {0..3}; do + import_pool ${DEV_DIR_PREFIX}$i $POOLNAME & +done +wait + +# check the result of background imports +typeset num_imports=0 +typeset num_cannot=0 +for i in {0..3}; do + if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then + ((num_imports += 1)) + fi + if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then + ((num_cannot += 1)) + loser=$i + fi +done +[[ $num_imports -eq "1" ]] || log_fail "expecting an import" +[[ $num_cannot -eq "3" ]] || \ + log_fail "expecting 3 failed imports, found $num_cannot" + +log_note "$num_imports imported and $num_cannot failed (expected)" + +log_pass "parallel pool imports by same name only have one winner" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh new file mode 100755 index 000000000000..71b2437a37ec --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -0,0 +1,137 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg +.
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# test uses 8 vdevs +export MAX_NUM=8 + +# +# DESCRIPTION: +# Verify that pool imports can occur in parallel +# +# STRATEGY: +# 1. Create 8 pools +# 2. Generate some ZIL records +# 3. Export the pools +# 4. Import half of the pools synchronously to baseline sequential cost +# 5. Import the other half asynchronously to demonstrate parallel savings +# 6. Export 4 pools +# 7. Test zpool import -a +# + +verify_runnable "global" + +# +# override the minimum sized vdevs +# +VDEVSIZE=$((512 * 1024 * 1024)) +increase_device_sizes $VDEVSIZE + +POOLNAME="import_pool" + +function cleanup +{ + zinject -c all + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + for i in {0..$(($MAX_NUM - 1))}; do + destroy_pool $POOLNAME-$i + done + # reset the devices + increase_device_sizes 0 + increase_device_sizes $FILE_SIZE +} + +log_assert "Pool imports can occur in parallel" + +log_onexit cleanup + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 + + +# +# create some exported pools with import delay injectors +# +for i in {0..$(($MAX_NUM - 1))}; do + log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# import half of the pools synchronously +# +SECONDS=0 +for i in {0..3}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i +done +sequential_time=$SECONDS +log_note "sequentially imported 4 pools in $sequential_time seconds" + +# +# import half of the pools in parallel +# +SECONDS=0 +for i in {4..7}; do + log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i & +done +wait +parallel_time=$SECONDS +log_note "asynchronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +# +# export pools with import delay injectors +# +for i in {4..7}; do + log_must zpool export $POOLNAME-$i + log_must zinject -P import -s 12 $POOLNAME-$i +done +wait + +# +# now test zpool import -a +# +SECONDS=0 +log_must zpool import -a -d $DEVICE_DIR -f +parallel_time=$SECONDS +log_note "asynchronously imported 4 pools in $parallel_time seconds" + +log_must test $parallel_time -lt $(($sequential_time / 3)) + +log_pass "Pool imports occur in parallel" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh new file mode 100755 index 000000000000..c96961bf6419 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -0,0 +1,132 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh
new file mode 100755
index 000000000000..c96961bf6419
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh
@@ -0,0 +1,132 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+
+#
+# DESCRIPTION:
+#	During a pool import, the 'import_progress' kstat contains details
+#	on the import progress.
+#
+# STRATEGY:
+#	1. Create test pool with several devices
+#	2. Generate some ZIL records and spacemap logs
+#	3. Export the pool
+#	4. Import the pool in the background and monitor the kstat content
+#	5. Check the zfs debug messages for import progress
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+
+	destroy_pool $TESTPOOL1
+}
+
+log_assert "During a pool import, the 'import_progress' kstat contains " \
+	"notes on the progress"
+
+log_onexit cleanup
+
+log_must zpool create $TESTPOOL1 $VDEV0 $VDEV1 $VDEV2
+typeset guid=$(zpool get -H -o value guid $TESTPOOL1)
+
+log_must zfs create -o recordsize=8k $TESTPOOL1/fs
+#
+# This dd command works around an issue where ZIL records aren't created
+# after freezing the pool unless a ZIL header already exists. Create a file
+# synchronously to force ZFS to write one out.
+#
+log_must dd if=/dev/zero of=/$TESTPOOL1/fs/sync conv=fsync bs=1 count=1
+
+#
+# Overwrite some blocks to populate spacemap logs
+#
+log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200
+sync_all_pools
+log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200
+sync_all_pools
+
+#
+# Freeze the pool to retain intent log records
+#
+log_must zpool freeze $TESTPOOL1
+
+# fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data]
+log_must fill_fs /$TESTPOOL1/fs 1 2000 100 1024 R
+
+log_must zpool list -v $TESTPOOL1
+
+#
+# Unmount the filesystem and export the pool
+#
+# At this stage the ZFS intent log contains
+# a set of records to replay.
+#
+log_must zfs unmount /$TESTPOOL1/fs
+
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must zpool export $TESTPOOL1
+
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+log_note "Starting zpool import in background at" $(date +'%H:%M:%S')
+zpool import -d $DEVICE_DIR -f $guid &
+pid=$!
+
+#
+# capture progress until import is finished
+#
+log_note "waiting for pid $pid to exit"
+kstat import_progress
+while [[ -d /proc/"$pid" ]]; do
+	line=$(kstat import_progress | grep -v pool_guid)
+	if [[ -n $line ]]; then
+		echo "$line"
+	fi
+	if [[ -f /$TESTPOOL1/fs/00 ]]; then
+		break
+	fi
+	sleep 0.0001
+done
+log_note "zpool import completed at" $(date +'%H:%M:%S')
+
+entries=$(kstat dbgmsg | grep "spa_import_progress_set_notes_impl(): '$TESTPOOL1'" | wc -l)
+log_note "found $entries progress notes in dbgmsg"
+log_must test $entries -gt 20
+
+log_must zpool status $TESTPOOL1
+
+log_pass "During a pool import, the 'import_progress' kstat contains " \
+	"notes on the progress"
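The kstat helper used in the loop above is a thin wrapper over procfs on Linux, so while an import is in flight the same progress notes can be watched by hand. A rough sketch, assuming the Linux kstat location (FreeBSD exposes kstats through sysctl instead):

    # poll the import progress kstat once a second during a foreground import
    while true; do
        grep -v pool_guid /proc/spl/kstat/zfs/import_progress
        sleep 1
    done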
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh
index 3e76347b029a..a3a422b99fd1 100755
--- a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh
+++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh
@@ -67,7 +67,15 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
 log_must fio $FIO_SCRIPTS/mkfiles.fio
 log_must fio $FIO_SCRIPTS/random_reads.fio
 
+timeout_handler() {
+	log_fail "${TIMEOUT_MESSAGE}"
+}
+
+TIMEOUT_MESSAGE="Timed out waiting for arcstat_quiescence_noecho l2_size before zpool offline"
+trap timeout_handler USR1
+ppid="$$" && (sleep 600 && kill -USR1 "$ppid") & timeout_pid="$!"
 arcstat_quiescence_noecho l2_size
+trap - USR1
 log_must zpool offline $TESTPOOL $VDEV_CACHE
 arcstat_quiescence_noecho l2_size
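The hunk above arms a watchdog so that a hung arcstat_quiescence_noecho fails the test instead of stalling the whole suite: a backgrounded subshell sleeps, then signals the parent, whose USR1 trap calls log_fail. One refinement worth noting is to disarm the timer as soon as the guarded call returns, so the signal cannot fire minutes later in an unrelated part of the test. A sketch of the same pattern with that addition (message text illustrative):

    timeout_handler() {
        log_fail "$TIMEOUT_MESSAGE"
    }

    TIMEOUT_MESSAGE="timed out waiting for l2_size to quiesce"
    trap timeout_handler USR1
    (sleep 600 && kill -USR1 $$) &      # arm a 10-minute watchdog
    timeout_pid=$!

    arcstat_quiescence_noecho l2_size   # the blocking call being guarded

    kill "$timeout_pid" 2>/dev/null     # disarm: the subshell dies before it can signal
    trap - USR1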
diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am
index 2848fd4ce692..211e026a87aa 100644
--- a/tests/zfs-tests/tests/functional/mmp/Makefile.am
+++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am
@@ -8,6 +8,7 @@ dist_pkgdata_SCRIPTS = \
 	mmp_active_import.ksh \
 	mmp_inactive_import.ksh \
 	mmp_exported_import.ksh \
+	mmp_write_slow_disk.ksh \
 	mmp_write_uberblocks.ksh \
 	mmp_reset_interval.ksh \
 	mmp_on_zdb.ksh \
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
new file mode 100755
index 000000000000..8b118684aa7f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara, Inc.
+#
+
+# DESCRIPTION:
+# Verify that long VDEV probes do not cause MMP checks to suspend the pool.
+# Note: without the PR-15839 fix, this test will suspend the pool.
+#
+# A device that is returning unexpected errors will trigger a vdev_probe.
+# When the device additionally has slow response times, the probe can hold
+# the spa config lock as a writer for a long period of time such that the
+# mmp uberblock updates stall when trying to acquire the spa config lock.
+#
+# STRATEGY:
+# 1. Create a pool with multiple leaf vdevs
+# 2. Enable multihost and multihost_history
+# 3. Wait for MMP writes to occur
+# 4. Verify that a long VDEV probe didn't cause the MMP checks to suspend the pool
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must zinject -c all
+
+	if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
+		log_must zpool clear $MMP_POOL
+		zpool get state $MMP_POOL $MMP_DIR/file.3
+		zpool events | grep ".fs.zfs." | grep -v "history_event"
+	fi
+
+	poolexists $MMP_POOL && destroy_pool $MMP_POOL
+	log_must rm -r $MMP_DIR
+	log_must mmp_clear_hostid
+}
+
+log_assert "A long VDEV probe doesn't cause an MMP check suspend"
+log_onexit cleanup
+
+MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
+
+# Create a multiple-drive pool
+log_must zpool events -c
+log_must mkdir -p $MMP_DIR
+log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
+log_must zpool create -f $MMP_POOL \
+	mirror $MMP_DIR/file.{0,1,2} \
+	mirror $MMP_DIR/file.{3,4,5}
+
+# Enable MMP
+log_must mmp_set_hostid $HOSTID1
+log_must zpool set multihost=on $MMP_POOL
+clear_mmp_history
+
+# Inject vdev write errors along with a delay on one leaf
+log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
+
+log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
+sleep 10
+sync_pool $MMP_POOL
+
+# Confirm MMP writes to the non-slow disks have taken place
+for x in {0,1,2,4}; do
+	write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
+	[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
+done
+
+# Expect that the pool was not suspended
+log_must check_state $MMP_POOL "" "ONLINE"
+health=$(zpool list -H -o health $MMP_POOL)
+log_note "$MMP_POOL health is $health"
+[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
+
+log_pass "A long VDEV probe doesn't cause an MMP check suspend"
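For reference, the three injections this test layers onto a single leaf all use existing zinject forms: -f takes a failure frequency as a percentage, -L targets a specific label region (pad2, uber), and -D latency:lanes adds a fixed delay across a number of concurrent I/O lanes. A standalone sketch against a scratch pool (pool and file names illustrative):

    # fail 33% of pad2 label writes and 50% of uberblock writes on one leaf
    zinject -f 33 -e io -L pad2 -T write -d /var/tmp/mmp/file.3 tank
    zinject -f 50 -e io -L uber -T write -d /var/tmp/mmp/file.3 tank
    # delay every write to that leaf by 2000 ms, across 4 in-flight lanes
    zinject -D 2000:4 -T write -d /var/tmp/mmp/file.3 tank
    # ... exercise the pool ...
    zinject -c all    # clear all handlers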