From 0196839ecb045c4bbec318fe30a17cc015e6665b Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 20 Jul 2021 16:03:33 +0200 Subject: [PATCH 1/7] A few fixes of callback typecasting (for the upcoming ClangCFI) * zio: avoid callback typecasting * zil: avoid zil_itxg_clean() callback typecasting * zpl: decouple zpl_readpage() into two separate callbacks * nvpair: explicitly declare callbacks for xdr_array() * linux/zfs_nvops: don't use external iput() as a callback * zcp_synctask: don't use fnvlist_free() as a callback * zvol: don't use ops->zv_free() as a callback for taskq_dispatch() Reviewed-by: Brian Behlendorf Reviewed-by: Mark Maybee Signed-off-by: Alexander Lobakin Closes #12260 --- include/sys/zio.h | 4 +- module/nvpair/nvpair.c | 64 ++++++++++++++++++++++++++---- module/os/linux/zfs/zfs_vnops_os.c | 8 +++- module/os/linux/zfs/zpl_file.c | 19 +++++++-- module/zfs/zcp_synctask.c | 15 ++++--- module/zfs/zil.c | 5 ++- module/zfs/zio.c | 18 ++++----- module/zfs/zvol.c | 10 ++++- 8 files changed, 111 insertions(+), 32 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 94c6a2c5fa44..34d3d996cc38 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -576,8 +576,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); -extern void zio_execute(zio_t *zio); -extern void zio_interrupt(zio_t *zio); +extern void zio_execute(void *zio); +extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, char *tag); diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index dd06ccb73407..ed60b02a8c1d 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -3216,6 +3216,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs) return (0); } +/* + * xdrproc_t-compatible callbacks for xdr_array() + */ + +#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, ...) \ +{ \ + va_list args; \ + void *ptr; \ + \ + va_start(args, xdrs); \ + ptr = va_arg(args, void *); \ + va_end(args); \ + \ + return (xdr_##type(xdrs, ptr)); \ +} + +#else /* FreeBSD, sunrpc */ + +#define NVS_BUILD_XDRPROC_T(type) \ +static bool_t \ +nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) 
\ +{ \ + return (xdr_##type(xdrs, ptr)); \ +} + +#endif + +/* BEGIN CSTYLED */ +NVS_BUILD_XDRPROC_T(char); +NVS_BUILD_XDRPROC_T(short); +NVS_BUILD_XDRPROC_T(u_short); +NVS_BUILD_XDRPROC_T(int); +NVS_BUILD_XDRPROC_T(u_int); +NVS_BUILD_XDRPROC_T(longlong_t); +NVS_BUILD_XDRPROC_T(u_longlong_t); +/* END CSTYLED */ + /* * The format of xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data @@ -3338,38 +3388,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), - (xdrproc_t)xdr_char); + nvs_xdr_nvp_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), - sizeof (int16_t), (xdrproc_t)xdr_short); + sizeof (int16_t), nvs_xdr_nvp_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), - sizeof (uint16_t), (xdrproc_t)xdr_u_short); + sizeof (uint16_t), nvs_xdr_nvp_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), - sizeof (int32_t), (xdrproc_t)xdr_int); + sizeof (int32_t), nvs_xdr_nvp_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), - sizeof (uint32_t), (xdrproc_t)xdr_u_int); + sizeof (uint32_t), nvs_xdr_nvp_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), - sizeof (int64_t), (xdrproc_t)xdr_longlong_t); + sizeof (int64_t), nvs_xdr_nvp_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), - sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); + sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 24c016c5fcf1..e0dc6ed95747 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -367,6 +367,12 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, return (error); } +static void +zfs_rele_async_task(void *arg) +{ + iput(arg); +} + void zfs_zrele_async(znode_t *zp) { @@ -386,7 +392,7 @@ zfs_zrele_async(znode_t *zp) */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); + zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 524c43dcded4..0319148b983d 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -591,8 +591,8 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ -static int -zpl_readpage(struct file *filp, struct page *pp) +static inline int +zpl_readpage_common(struct page *pp) { struct inode *ip; struct page *pl[1]; @@ -620,6 +620,18 @@ zpl_readpage(struct file *filp, struct page *pp) return (error); } +static int +zpl_readpage(struct file *filp, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + +static int +zpl_readpage_filler(void *data, struct page *pp) +{ + return (zpl_readpage_common(pp)); +} + /* * Populate a set of pages with data for the Linux page cache. 
This * function will only be called for read ahead and never for demand @@ -630,8 +642,7 @@ static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return (read_cache_pages(mapping, pages, - (filler_t *)zpl_readpage, filp)); + return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } static int diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c index 4e0fa0d85cbf..c6ade59b9ced 100644 --- a/module/zfs/zcp_synctask.c +++ b/module/zfs/zcp_synctask.c @@ -54,6 +54,12 @@ typedef struct zcp_synctask_info { int blocks_modified; } zcp_synctask_info_t; +static void +zcp_synctask_cleanup(void *arg) +{ + fnvlist_free(arg); +} + /* * Generic synctask interface for channel program syncfuncs. * @@ -275,7 +281,7 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); + zcp_synctask_cleanup, ddsa.ddsa_snaps); err = zcp_sync_task(state, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, sync, dsname); @@ -363,7 +369,7 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, fnvlist_add_boolean(dpsa->dpsa_props, prop); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props); + zcp_synctask_cleanup, dpsa->dpsa_props); err = zcp_sync_task(state, zcp_synctask_inherit_prop_check, zcp_synctask_inherit_prop_sync, &zipa, sync, dsname); @@ -402,7 +408,7 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) fnvlist_add_string(bmarks, new, source); zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, bmarks); + zcp_synctask_cleanup, bmarks); dsl_bookmark_create_arg_t dbca = { .dbca_bmarks = bmarks, @@ -467,8 +473,7 @@ zcp_synctask_wrapper(lua_State *state) * Make sure err_details is properly freed, even if a fatal error is * thrown during the synctask. */ - zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, err_details); + zch = zcp_register_cleanup(state, zcp_synctask_cleanup, err_details); zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 658513786b34..873129b8a3b3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1823,12 +1823,13 @@ zil_itx_destroy(itx_t *itx) * so no locks are needed. */ static void -zil_itxg_clean(itxs_t *itxs) +zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; + itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; @@ -2048,7 +2049,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, - (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP); + zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7654755ec7b3..b18c11a6db92 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1891,8 +1891,8 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) * to dispatch the zio to another taskq at the same time. 
*/ ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, - flags, &zio->io_tqent); + spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, + &zio->io_tqent); } static boolean_t @@ -1923,7 +1923,7 @@ zio_issue_async(zio_t *zio) } void -zio_interrupt(zio_t *zio) +zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } @@ -1981,8 +1981,8 @@ zio_delay_interrupt(zio_t *zio) * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, - (task_func_t *)zio_interrupt, - zio, TQ_NOSLEEP, expire_at_tick); + zio_interrupt, zio, TQ_NOSLEEP, + expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just @@ -2103,7 +2103,7 @@ static zio_pipe_stage_t *zio_pipeline[]; * it is externally visible. */ void -zio_execute(zio_t *zio) +zio_execute(void *zio) { fstrans_cookie_t cookie; @@ -2296,8 +2296,9 @@ zio_nowait(zio_t *zio) */ static void -zio_reexecute(zio_t *pio) +zio_reexecute(void *arg) { + zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); @@ -4792,8 +4793,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - (task_func_t *)zio_reexecute, zio, 0, - &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 1bbdbceb3dde..40c6d619374a 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1153,6 +1153,12 @@ zvol_create_minor(const char *name) * Remove minors for specified dataset including children and snapshots. */ +static void +zvol_free_task(void *arg) +{ + ops->zv_free(arg); +} + void zvol_remove_minors_impl(const char *name) { @@ -1204,8 +1210,8 @@ zvol_remove_minors_impl(const char *name) mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ - t = taskq_dispatch(system_taskq, - (task_func_t *)ops->zv_free, zv, TQ_SLEEP); + t = taskq_dispatch(system_taskq, zvol_free_task, zv, + TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { From cde45f93434a6a27d249938dca59ba0852cc1722 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 21 Jul 2021 08:40:36 -0400 Subject: [PATCH 2/7] Optimize allocation throttling Remove mc_lock use from metaslab_class_throttle_*(). The math there is based on refcounts and so atomic, so the only race possible there is between zfs_refcount_count() and zfs_refcount_add(). But in most cases metaslab_class_throttle_reserve() is called with the allocator lock held, which covers the race. In cases where the lock is not held, GANG_ALLOCATION() or METASLAB_MUST_RESERVE are set, and so we do not use zfs_refcount_count(). And even if we assume some other non-existing scenario, the worst that may happen from this race is few more I/Os get to allocation earlier, that is not a problem. Move locks and data of different allocators into different cache lines to avoid false sharing. Group spa_alloc_* arrays together into single array of aligned struct spa_alloc spa_allocs. Align struct metaslab_class_allocator. Reviewed-by: Paul Dagnelie Reviewed-by: Ryan Moeller Reviewed-by: Don Brady Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. 
Closes #12314 --- include/sys/metaslab_impl.h | 2 +- include/sys/spa_impl.h | 15 +++++++++------ module/zfs/metaslab.c | 20 ++++---------------- module/zfs/spa.c | 12 ++++++------ module/zfs/spa_misc.c | 21 +++++++++------------ module/zfs/zio.c | 33 ++++++++++++++++----------------- 6 files changed, 45 insertions(+), 58 deletions(-) diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 9924c3ba0eaa..adf4c03a20db 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator { */ uint64_t mca_alloc_max_slots; zfs_refcount_t mca_alloc_slots; -} metaslab_class_allocator_t; +} ____cacheline_aligned metaslab_class_allocator_t; /* * A metaslab class encompasses a category of allocatable top-level vdevs. diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index bc88cfa15e8e..cb2c49e5852a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -57,6 +57,11 @@ extern "C" { #endif +typedef struct spa_alloc { + kmutex_t spaa_lock; + avl_tree_t spaa_tree; +} ____cacheline_aligned spa_alloc_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; @@ -250,13 +255,11 @@ struct spa { list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* - * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are - * stored in spa_alloc_count. There is one tree and one lock for each - * allocator, to help improve allocation performance in write-heavy - * workloads. + * spa_allocs is an array, whose lengths is stored in spa_alloc_count. + * There is one tree and one lock for each allocator, to help improve + * allocation performance in write-heavy workloads. */ - kmutex_t *spa_alloc_locks; - avl_tree_t *spa_alloc_trees; + spa_alloc_t *spa_allocs; int spa_alloc_count; spa_aux_vdev_t spa_spares; /* hot spares */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c84b54e9233f..01c84a621aba 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - uint64_t available_slots = 0; - boolean_t slot_reserved = B_FALSE; uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - - uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots); - if (reserved_slots < max) - available_slots = max - reserved_slots; - - if (slots <= available_slots || GANG_ALLOCATION(flags) || - flags & METASLAB_MUST_RESERVE) { + if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || + zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. 
@@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, for (int d = 0; d < slots; d++) zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; - slot_reserved = B_TRUE; + return (B_TRUE); } - - mutex_exit(&mc->mc_lock); - return (slot_reserved); + return (B_FALSE); } void @@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) zfs_refcount_remove(&mca->mca_alloc_slots, zio); - mutex_exit(&mc->mc_lock); } static int diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 030e7e418e78..0111829a925a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9313,9 +9313,9 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* @@ -9426,9 +9426,9 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); + mutex_enter(&spa->spa_allocs[i].spaa_lock); + VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); + mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e6b8cc4b3d4b..436611fc909a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -703,13 +703,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; - spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * - sizeof (kmutex_t), KM_SLEEP); - spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * - sizeof (avl_tree_t), KM_SLEEP); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * + sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); - avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, + NULL); + avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, @@ -802,13 +801,11 @@ spa_remove(spa_t *spa) } for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_alloc_trees[i]); - mutex_destroy(&spa->spa_alloc_locks[i]); + avl_destroy(&spa->spa_allocs[i].spaa_tree); + mutex_destroy(&spa->spa_allocs[i].spaa_lock); } - kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * - sizeof (kmutex_t)); - kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * - sizeof (avl_tree_t)); + kmem_free(spa->spa_allocs, spa->spa_alloc_count * + sizeof (spa_alloc_t)); avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b18c11a6db92..f48234c4a2c6 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != 
NULL) { - if (zio->io_metaslab_class == NULL) - zio->io_metaslab_class = pio->io_metaslab_class; + zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) @@ -3384,9 +3383,9 @@ zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); + ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); - zio = avl_first(&spa->spa_alloc_trees[allocator]); + zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); @@ -3398,11 +3397,11 @@ zio_io_to_allocate(spa_t *spa, int allocator) */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { + zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_trees[allocator], zio); + avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); @@ -3426,8 +3425,8 @@ zio_dva_throttle(zio_t *zio) return (zio); } + ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); @@ -3439,14 +3438,14 @@ zio_dva_throttle(zio_t *zio) * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ - zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_allocator = allocator; zio->io_metaslab_class = mc; - avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(spa, zio->io_allocator); - mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); + avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); + nio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } @@ -3455,9 +3454,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_locks[allocator]); + mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_alloc_locks[allocator]); + mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; @@ -3647,8 +3646,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * some parallelism. */ int flags = METASLAB_FASTWRITE | METASLAB_ZIL; - int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count; + int allocator = (uint_t)cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); From c85f1613b1fa1002582a0fc032d7be0339380d67 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 14 Feb 2023 01:37:46 +0100 Subject: [PATCH 3/7] Fix a race condition in dsl_dataset_sync() when activating features The zio returned from arc_write() in dmu_objset_sync() uses zio_nowait(). 
However we may reach the end of dsl_dataset_sync() which checks if we need to activate features in the filesystem without knowing if that zio has even run through the ZIO pipeline yet. In that case we will flag features to be activated in dsl_dataset_block_born() but dsl_dataset_sync() has already completed its run and those features will not actually be activated. Mitigate this by moving the feature activation code in dsl_dataset_sync_done(). Also add new ASSERTs in dsl_scan_visitbp() checking if a block contradicts any filesystem flags. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #13816 --- include/sys/dsl_dataset.h | 1 + module/zfs/dsl_dataset.c | 23 ++++++++++++----------- module/zfs/dsl_scan.c | 20 ++++++++++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index ed934f969e92..62863429ea6a 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -363,6 +363,7 @@ int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); +boolean_t zfeature_active(spa_feature_t f, void *arg); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 238f2d29e74a..2eddd7634fc8 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1055,7 +1055,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds) return (rv); } -static boolean_t +boolean_t zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { @@ -2118,16 +2118,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (zfeature_active(f, ds->ds_feature_activation[f])) { - if (zfeature_active(f, ds->ds_feature[f])) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, - ds->ds_feature_activation[f], tx); - ds->ds_feature[f] = ds->ds_feature_activation[f]; - } - } } /* @@ -2302,6 +2292,17 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); dmu_buf_rele(ds->ds_dbuf, ds); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, + ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } } int diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b07be5b73395..570148bca280 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1981,6 +1981,26 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } + /* + * Check if this block contradicts any filesystem flags. 
+ */
+	spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
+	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+		ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+	if (f != SPA_FEATURE_NONE)
+		ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
+	if (f != SPA_FEATURE_NONE)
+		ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+		scn->scn_lt_min_this_txg++;
+		return;
+	}
+
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
From 122aff033670985510e49ddc37a1d19d93071567 Mon Sep 17 00:00:00 2001
From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>
Date: Mon, 6 Nov 2023 11:38:42 -0700
Subject: [PATCH 4/7] Improve ZFS objset sync parallelism

As part of transaction group commit, dsl_pool_sync() sequentially calls
dsl_dataset_sync() for each dirty dataset, which subsequently calls
dmu_objset_sync(). dmu_objset_sync() in turn uses up to 75% of CPU
cores to run sync_dnodes_task() in taskq threads to sync the dirty
dnodes (files).

There are two problems:

1. Each ZVOL in a pool is a separate dataset/objset having a single
   dnode. This means the objsets are synchronized serially, which
   leads to a bottleneck of ~330K blocks written per second per pool.

2. In the case of multiple dirty dnodes/files on a dataset/objset on a
   big system they will be sync'd in parallel taskq threads. However,
   it is inefficient to use 75% of CPU cores of a big system to do
   that, because of (a) bottlenecks on a single write issue taskq, and
   (b) allocation throttling. In addition, if not for the allocation
   throttling sorting write requests by bookmarks (logical address),
   writes for different files may reach space allocators interleaved,
   leading to unwanted fragmentation.

The solution to both problems is to always sync no more and (if
possible) no fewer dnodes at the same time than there are allocators
in the pool.
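
The accounting that enforces this is a simple completion count per
objset: every sublist sync task decrements it, and the last task to
finish dispatches the meta dnode sync (see sync_dnodes_task() and
sync_meta_dnode_task() in the dmu_objset_sync() changes below). As a
rough, self-contained sketch of that pattern only, with pthreads
standing in for taskq threads (the names below are invented for the
sketch and are not the ones used by this patch):

    #include <pthread.h>
    #include <stdio.h>

    typedef struct sync_state {
            pthread_mutex_t lock;
            int outstanding;        /* sublist sync tasks still running */
    } sync_state_t;

    static void
    sync_meta_dnode(sync_state_t *ss)
    {
            /* Runs exactly once, after every sublist task has finished. */
            (void) ss;
            (void) printf("meta dnode synced\n");
    }

    static void *
    sync_sublist(void *arg)
    {
            sync_state_t *ss = arg;

            /* ... sync the dirty dnodes of one sublist here ... */

            pthread_mutex_lock(&ss->lock);
            int remaining = --ss->outstanding;
            pthread_mutex_unlock(&ss->lock);

            /* The last task out triggers the follow-up work. */
            if (remaining == 0)
                    sync_meta_dnode(ss);
            return (NULL);
    }

    int
    main(void)
    {
            enum { NTASKS = 4 };
            pthread_t tid[NTASKS];
            sync_state_t ss = { .outstanding = NTASKS };

            pthread_mutex_init(&ss.lock, NULL);
            for (int i = 0; i < NTASKS; i++)
                    pthread_create(&tid[i], NULL, sync_sublist, &ss);
            for (int i = 0; i < NTASKS; i++)
                    pthread_join(&tid[i], NULL);
            pthread_mutex_destroy(&ss.lock);
            return (0);
    }

The patch itself expresses the same idea with soa_count protected by
soa_mutex, and the last sync_dnodes_task() dispatches
sync_meta_dnode_task() via taskq_dispatch_ent() with TQ_FRONT so the
meta dnode work does not queue behind other sync tasks.
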
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Edmund Nadolski Closes #15197 --- include/os/freebsd/spl/sys/taskq.h | 3 + include/os/linux/spl/sys/taskq.h | 2 + include/sys/spa.h | 5 + include/sys/spa_impl.h | 12 +- include/sys/zfs_context.h | 2 + include/sys/zio.h | 6 + lib/libzpool/taskq.c | 30 +++++ man/man4/zfs.4 | 26 ++-- module/os/freebsd/spl/spl_taskq.c | 82 +++++++++++++ module/os/linux/spl/spl-taskq.c | 36 ++++++ module/zfs/dbuf.c | 9 +- module/zfs/dmu_objset.c | 139 +++++++++++++++------ module/zfs/dnode_sync.c | 1 + module/zfs/dsl_dataset.c | 5 +- module/zfs/dsl_pool.c | 29 +++-- module/zfs/spa.c | 187 ++++++++++++++++++++++++----- module/zfs/spa_misc.c | 18 ++- module/zfs/zio.c | 43 ++++--- 18 files changed, 521 insertions(+), 114 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 3040549e043d..fd7314498af6 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -42,6 +42,7 @@ extern "C" { typedef struct taskq { struct taskqueue *tq_queue; + int tq_nthreads; } taskq_t; typedef uintptr_t taskqid_t; @@ -93,6 +94,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, struct proc *, uint_t); diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index b50175a10873..08e733a158f9 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -149,6 +149,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); diff --git a/include/sys/spa.h b/include/sys/spa.h index c79cd0247d18..59f0eebfbfe5 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -828,6 +828,11 @@ extern void spa_sync_allpools(void); extern int zfs_sync_pass_deferred_free; +/* spa sync taskqueues */ +taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); +void spa_sync_tq_destroy(spa_t *spa); +void spa_select_allocator(zio_t *zio); + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index cb2c49e5852a..e8f4b25523d3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -187,6 +187,12 @@ typedef struct spa_taskqs { taskq_t **stqs_taskq; } spa_taskqs_t; +/* one for each thread in the spa sync taskq */ +typedef struct spa_syncthread_info { + kthread_t *sti_thread; + taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ +} spa_syncthread_info_t; + typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. 
*/ @@ -262,6 +268,10 @@ struct spa { spa_alloc_t *spa_allocs; int spa_alloc_count; + /* per-allocator sync thread taskqs */ + taskq_t *spa_sync_tq; + spa_syncthread_info_t *spa_syncthreads; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -445,7 +455,7 @@ extern char *spa_config_path; extern char *zfs_deadman_failmode; extern int spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 4d67e652ab62..2b1d098a98c3 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -491,6 +491,8 @@ extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ diff --git a/include/sys/zio.h b/include/sys/zio.h index 34d3d996cc38..041f4645bd75 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,6 +222,9 @@ enum zio_flag { ZIO_FLAG_FASTWRITE = 1 << 31, }; +#define ZIO_ALLOCATOR_NONE (-1) +#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) + #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) @@ -509,6 +512,9 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; + + /* write issue taskq selection, based upon sync thread */ + taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 9b6d4e0d7e97..16b206e686ef 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -332,6 +332,36 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (taskq_t)); } +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + (void) pri; (void) minalloc; (void) maxalloc; + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tq->tq_threadlist[i]; + } + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *t) { diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6da8d42b42bd..660af4dc80e0 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -449,7 +449,14 @@ If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . 
-.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int +.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int +Determines the number of block alloctators to use per spa instance. +Capped by the number of actual CPUs in the system. +.Pp +Note that setting this value too high could result in performance +degredation and/or excess fragmentation. +. +.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, @@ -1869,14 +1876,7 @@ and may need to load new metaslabs to satisfy these allocations. .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq int Rewrite new block pointers starting in this pass. . -.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int -This controls the number of threads used by -.Sy dp_sync_taskq . -The default value of -.Sy 75% -will create a maximum of one thread per CPU. -. -.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq uint +.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . @@ -2128,6 +2128,14 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint +Determines the number of CPUs to run write issue taskqs. +.Pp +When 0 (the default), the value to use is computed internally +as the number of actual CPUs in the system divided by the +.Sy spa_num_allocators +value. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index 3fa7939bdb3c..73431bb10040 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -218,6 +218,7 @@ taskq_create_impl(const char *name, int nthreads, pri_t pri, nthreads = MAX((mp_ncpus * nthreads) / 100, 1); tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_nthreads = nthreads; tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, @@ -252,6 +253,87 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (*tq)); } +static void taskq_sync_assign(void *arg); + +typedef struct taskq_sync_arg { + kthread_t *tqa_thread; + kcondvar_t tqa_cv; + kmutex_t tqa_lock; + int tqa_ready; +} taskq_sync_arg_t; + +static void +taskq_sync_assign(void *arg) +{ + taskq_sync_arg_t *tqa = arg; + + mutex_enter(&tqa->tqa_lock); + tqa->tqa_thread = curthread; + tqa->tqa_ready = 1; + cv_signal(&tqa->tqa_cv); + while (tqa->tqa_ready == 1) + cv_wait(&tqa->tqa_cv, &tqa->tqa_lock); + mutex_exit(&tqa->tqa_lock); +} + +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. 
+ */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP); + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + /* spawn all syncthreads */ + for (int i = 0; i < nthreads; i++) { + cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL); + (void) taskq_dispatch(tq, taskq_sync_assign, + &tqs[i], TQ_FRONT); + } + + /* wait on all syncthreads to start */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + while (tqs[i].tqa_ready == 0) + cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock); + mutex_exit(&tqs[i].tqa_lock); + } + + /* let all syncthreads resume, finish */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + tqs[i].tqa_ready = 2; + cv_broadcast(&tqs[i].tqa_cv); + mutex_exit(&tqs[i].tqa_lock); + } + taskq_wait(tq); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tqs[i].tqa_thread; + mutex_destroy(&tqs[i].tqa_lock); + cv_destroy(&tqs[i].tqa_cv); + } + kmem_free(tqs, sizeof (*tqs) * nthreads); + + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *thread) { diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 61631256c858..ca4676ac181c 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1225,6 +1225,42 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int i = 0; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + /* taskq_create spawns all the threads before returning */ + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { + kthreads[i] = tqt->tqt_thread; + i++; + } + + ASSERT3S(i, ==, nthreads); + *ktpp = kthreads; + + return (tq); +} +EXPORT_SYMBOL(taskq_create_synced); + static unsigned int spl_taskq_kick = 0; /* diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e5b025990738..d9b8e653ffba 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4461,6 +4461,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { @@ -4911,7 +4915,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } -/* Issue I/O to commit a dirty buffer to disk. */ +/* + * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. 
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). + */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index e201e103eccc..a328514de7ea 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1593,28 +1593,90 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) kmem_free(bp, sizeof (*bp)); } +typedef struct sync_objset_arg { + zio_t *soa_zio; + objset_t *soa_os; + dmu_tx_t *soa_tx; + kmutex_t soa_mutex; + int soa_count; + taskq_ent_t soa_tq_ent; +} sync_objset_arg_t; + typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + sync_objset_arg_t *sda_soa; } sync_dnodes_arg_t; +static void sync_meta_dnode_task(void *arg); + static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; + sync_objset_arg_t *soa = sda->sda_soa; + objset_t *os = soa->soa_os; multilist_sublist_t *ms = multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); - dmu_objset_sync_dnodes(ms, sda->sda_tx); + dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); kmem_free(sda, sizeof (*sda)); + + mutex_enter(&soa->soa_mutex); + ASSERT(soa->soa_count != 0); + if (--soa->soa_count != 0) { + mutex_exit(&soa->soa_mutex); + return; + } + mutex_exit(&soa->soa_mutex); + + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); } +/* + * Issue the zio_nowait() for all dirty record zios on the meta dnode, + * then trigger the callback for the zil_sync. This runs once for each + * objset, only after any/all sublists in the objset have been synced. + */ +static void +sync_meta_dnode_task(void *arg) +{ + sync_objset_arg_t *soa = arg; + objset_t *os = soa->soa_os; + dmu_tx_t *tx = soa->soa_tx; + int txgoff = tx->tx_txg & TXG_MASK; + dbuf_dirty_record_t *dr; + + ASSERT0(soa->soa_count); + + list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_remove_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(soa->soa_zio); + + mutex_destroy(&soa->soa_mutex); + kmem_free(soa, sizeof (*soa)); +} /* called from dsl */ void @@ -1624,8 +1686,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); @@ -1712,40 +1772,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) offsetof(dnode_t, dn_dirty_link[txgoff])); } + /* + * zio_nowait(zio) is done after any/all sublist and meta dnode + * zios have been nowaited, and the zil_sync() has been performed. + * The soa is freed at the end of sync_meta_dnode_task. 
+ */ + sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP); + soa->soa_zio = zio; + soa->soa_os = os; + soa->soa_tx = tx; + taskq_init_ent(&soa->soa_tq_ent); + mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL); + ml = &os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); + soa->soa_count = num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ + soa->soa_count--; } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); - zio_nowait(dr->dr_zio); - } - - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; + if (soa->soa_count == 0) { + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); + } else { + /* + * Sync sublists in parallel. The last to finish + * (i.e., when soa->soa_count reaches zero) must + * dispatch sync_meta_dnode_task. + */ + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = + kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_soa = soa; + (void) taskq_dispatch( + dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* sync_dnodes_task frees sda */ + } } - - /* - * Free intent log blocks up to this tx. - */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); } boolean_t diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 56e0b06dd7b2..aecc19aee2f2 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -620,6 +620,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2eddd7634fc8..3c47e4a2b3b6 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2088,8 +2088,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2117,7 +2118,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); + dmu_objset_sync(ds->ds_objset, rio, tx); } /* diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 33080dfc7407..ca0b10badf0c 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -135,11 +135,14 @@ int zfs_delay_min_dirty_percent = 60; unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* +<<<<<<< HEAD * This determines the number of threads used by the dp_sync_taskq. 
*/ int zfs_sync_taskq_batch_pct = 75; /* +======= +>>>>>>> 85a589016 (Improve ZFS objset sync parallelism) * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. @@ -208,9 +211,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -399,7 +400,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying @@ -619,7 +620,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -649,9 +650,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -660,9 +662,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -693,13 +695,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * user accounting information (and we won't get confused * about which blocks are part of the snapshot). 
*/ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -712,7 +714,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -1405,9 +1407,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0111829a925a..040786d32729 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -98,6 +98,7 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include /* * The interval, in seconds, at which failed configuration cache file writes @@ -107,16 +108,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -137,14 +138,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. 
* * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -154,7 +155,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ @@ -174,6 +175,8 @@ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +static uint_t zio_taskq_wr_iss_ncpus = 0; + /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. @@ -975,17 +978,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; - boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: - batch = B_TRUE; + case ZTI_MODE_SYNC: + + /* + * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', + * not to exceed the number of spa allocators. + */ + if (zio_taskq_wr_iss_ncpus == 0) { + count = MAX(boot_ncpus / spa->spa_alloc_count, 1); + } else { + count = MAX(1, + boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); + } + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: @@ -1032,7 +1052,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1053,9 +1073,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - + (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { @@ -1114,12 +1132,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. 
*/ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static taskq_t * +spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1127,12 +1144,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { + /* dispatch to assigned write issue taskq */ + tq = zio->io_wr_iss_tq; + return (tq); + } + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } + return (tq); +} +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, + zio_t *zio) +{ + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } @@ -1143,20 +1175,8 @@ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - taskqid_t id; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; - } - - id = taskq_dispatch(tq, func, arg, flags); + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); + taskqid_t id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } @@ -9498,6 +9518,104 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + kthread_t **kthreads; + + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. + */ + int nthreads = spa->spa_alloc_count; + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, + nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); + VERIFY(spa->spa_sync_tq != NULL); + VERIFY(kthreads != NULL); + + spa_taskqs_t *tqs = + &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + ti->sti_thread = kthreads[i]; + if (w == tqs->stqs_count) { + w = 0; + } + ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + } + + kmem_free(kthreads, sizeof (*kthreads) * nthreads); + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. 
+ */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = i; + zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; + zio->io_wr_iss_tq = NULL; +} + /* * ========================================================================== * Miscellaneous routines @@ -10077,3 +10195,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, + "Number of CPUs to run write issue taskqs"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 436611fc909a..904256323143 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -383,9 +383,13 @@ int spa_asize_inflation = 24; * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; -uint64_t spa_min_slop = 128ULL * 1024 * 1024; -uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -int spa_allocators = 4; +static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; +static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; + +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; /*PRINTFLIKE2*/ @@ -702,7 +706,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than CPUs. 
*/ + spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { @@ -711,6 +717,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -2979,3 +2986,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_int, ZMOD_RW, "Reserved free space in pool"); + +ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa, capped by ncpus"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f48234c4a2c6..be06c386e3df 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -869,6 +869,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -1891,7 +1892,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + &zio->io_tqent, zio); } static boolean_t @@ -1916,8 +1917,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2231,6 +2232,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2285,6 +2289,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); } @@ -2742,6 +2749,13 @@ zio_gang_issue(zio_t *zio) return (zio); } +static void +zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) +{ + cio->io_allocator = pio->io_allocator; + cio->io_wr_iss_tq = pio->io_wr_iss_tq; +} + static void zio_write_gang_member_ready(zio_t *zio) { @@ -2811,6 +2825,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) gbh_copies = SPA_DVAS_PER_BP - 1; + ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2874,6 +2889,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(pio, zio); + /* * Create and nowait the gang children. 
*/ @@ -2904,6 +2921,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(zio, cio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); @@ -3390,6 +3409,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to @@ -3426,21 +3446,12 @@ zio_dva_throttle(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - zio->io_allocator = allocator; + int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); @@ -3515,6 +3526,7 @@ zio_dva_allocate(zio_t *zio) * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ + ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -4380,6 +4392,7 @@ zio_ready(zio_t *zio) ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and @@ -4466,6 +4479,7 @@ zio_dva_throttle_done(zio_t *zio) } ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); @@ -4528,6 +4542,7 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); @@ -4792,7 +4807,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent, NULL); } return (NULL); } From 0bda0e21c5fe6c2a5cff959053a190e06776d1d0 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Mon, 11 Dec 2023 16:55:51 +0900 Subject: [PATCH 5/7] Windows: add taskq_create_synced() Signed-off-by: Jorgen Lundman --- include/os/windows/spl/sys/taskq.h | 2 + module/os/windows/spl/spl-taskq.c | 79 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/include/os/windows/spl/sys/taskq.h b/include/os/windows/spl/sys/taskq.h index 8dccbccdc2a7..db9fe42f3026 100644 --- a/include/os/windows/spl/sys/taskq.h +++ b/include/os/windows/spl/sys/taskq.h @@ -91,6 +91,8 @@ extern taskq_t 
*taskq_create_proc(const char *, int, pri_t, int, int, proc_t *, uint_t); extern taskq_t *taskq_create_sysdc(const char *, int, int, int, proc_t *, uint_t, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void nulltask(void *); extern void taskq_destroy(taskq_t *); diff --git a/module/os/windows/spl/spl-taskq.c b/module/os/windows/spl/spl-taskq.c index b307fe211808..47a72ae3ffc0 100644 --- a/module/os/windows/spl/spl-taskq.c +++ b/module/os/windows/spl/spl-taskq.c @@ -2795,6 +2795,85 @@ taskq_bucket_extend(void *arg) #endif /* __APPLE__ */ } +typedef struct taskq_sync_arg { + kthread_t *tqa_thread; + kcondvar_t tqa_cv; + kmutex_t tqa_lock; + int tqa_ready; +} taskq_sync_arg_t; + +static void +taskq_sync_assign(void *arg) +{ + taskq_sync_arg_t *tqa = arg; + + mutex_enter(&tqa->tqa_lock); + tqa->tqa_thread = curthread; + tqa->tqa_ready = 1; + cv_signal(&tqa->tqa_cv); + while (tqa->tqa_ready == 1) + cv_wait(&tqa->tqa_cv, &tqa->tqa_lock); + mutex_exit(&tqa->tqa_lock); +} + +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP); + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + /* spawn all syncthreads */ + for (int i = 0; i < nthreads; i++) { + cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL); + (void) taskq_dispatch(tq, taskq_sync_assign, + &tqs[i], TQ_FRONT); + } + + /* wait on all syncthreads to start */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + while (tqs[i].tqa_ready == 0) + cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock); + mutex_exit(&tqs[i].tqa_lock); + } + + /* let all syncthreads resume, finish */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + tqs[i].tqa_ready = 2; + cv_broadcast(&tqs[i].tqa_cv); + mutex_exit(&tqs[i].tqa_lock); + } + taskq_wait(tq); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tqs[i].tqa_thread; + mutex_destroy(&tqs[i].tqa_lock); + cv_destroy(&tqs[i].tqa_cv); + } + kmem_free(tqs, sizeof (*tqs) * nthreads); + + *ktpp = kthreads; + return (tq); +} + static int taskq_kstat_update(kstat_t *ksp, int rw) { From 0a5ac8f6830d2f36003c64a53ded4a5456c2b7a3 Mon Sep 17 00:00:00 2001 From: datacore-rm Date: Fri, 9 Feb 2024 13:23:09 +0530 Subject: [PATCH 6/7] Have taskq_create_synced() wait for threads to be created. 
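
Instead of assuming that all pool threads already exist when
taskq_create() returns, and VERIFY()ing that tq_nthreads == nthreads,
taskq_create_synced() now sleeps on tq_wait_cv until the taskq has
spawned every one of its nthreads threads, logging its progress while
it waits.

This is the usual "wait until a counter reaches its target under a
condition variable" idiom. The fragment below is a minimal,
stand-alone user-space sketch of that idiom, given purely for
illustration; it assumes POSIX threads, and names such as NWORKERS and
nrunning are invented for the sketch and are not part of the SPL API.

/*
 * Build with: cc -pthread sketch.c
 * Each worker increments a shared counter and signals; the creator
 * waits until the counter reaches its target, mirroring the
 * cv_wait() loop on tq_nthreads added in the hunk below.
 */
#include <pthread.h>
#include <stdio.h>

#define	NWORKERS	4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int nrunning;			/* counterpart of tq->tq_nthreads */

static void *
worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	nrunning++;			/* announce that this thread exists */
	pthread_cond_broadcast(&cv);	/* counterpart of cv_broadcast() */
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tids[NWORKERS];

	for (int i = 0; i < NWORKERS; i++)
		pthread_create(&tids[i], NULL, worker, NULL);

	/* counterpart of the cv_wait() loop in taskq_create_synced() */
	pthread_mutex_lock(&lock);
	while (nrunning != NWORKERS)
		pthread_cond_wait(&cv, &lock);
	pthread_mutex_unlock(&lock);

	printf("all %d workers are running\n", nrunning);

	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tids[i], NULL);
	return (0);
}

The kernel change follows the same structure: the creator holds
tq_lock, re-checks tq_nthreads in a loop, and blocks in cv_wait() on
tq_wait_cv until the thread count matches the request.
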
--- include/os/windows/spl/sys/sysmacros.h | 3 ++- include/os/windows/spl/sys/taskq.h | 5 +++-- module/os/windows/spl/spl-taskq.c | 19 +++++++++++++++++-- module/os/windows/spl/spl-windows.c | 7 +++++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h index f0f38d98858e..cba1f5fc2d26 100644 --- a/include/os/windows/spl/sys/sysmacros.h +++ b/include/os/windows/spl/sys/sysmacros.h @@ -60,6 +60,8 @@ extern uint32_t cpu_number(void); #define is_system_labeled() 0 extern unsigned int max_ncpus; +extern unsigned int boot_ncpus; +extern unsigned int num_ecores; #ifndef RLIM64_INFINITY #define RLIM64_INFINITY (~0ULL) @@ -132,7 +134,6 @@ extern uint32_t zone_get_hostid(void *zone); extern void spl_setup(void); extern void spl_cleanup(void); -#define boot_ncpus max_ncpus #define SET_ERROR(err) \ (__set_error(__FILE__, __func__, __LINE__, err), err) diff --git a/include/os/windows/spl/sys/taskq.h b/include/os/windows/spl/sys/taskq.h index db9fe42f3026..1459a778310d 100644 --- a/include/os/windows/spl/sys/taskq.h +++ b/include/os/windows/spl/sys/taskq.h @@ -58,8 +58,9 @@ struct proc; #define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ #ifdef _WIN32 -#define TASKQ_TIMESHARE 0x0020 /* macOS dynamic thread priority */ -#define TASKQ_REALLY_DYNAMIC 0x0040 /* don't filter out TASKQ_DYNAMIC */ +#define TASKQ_TIMESHARE 0x0020 /* macOS dynamic thread priority */ +#define TASKQ_REALLY_DYNAMIC 0x0040 /* don't filter out TASKQ_DYNAMIC */ +#define TASKQ_CREATE_SYNCED 0x0080 /* don't deflate ncpus */ #endif /* * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as diff --git a/module/os/windows/spl/spl-taskq.c b/module/os/windows/spl/spl-taskq.c index 47a72ae3ffc0..499485d904c2 100644 --- a/module/os/windows/spl/spl-taskq.c +++ b/module/os/windows/spl/spl-taskq.c @@ -2374,6 +2374,8 @@ taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); #ifdef _WIN32 uint_t ncpus = max_ncpus; + if (!(flags & TASKQ_CREATE_SYNCED)) + ncpus = boot_ncpus; /* possibly deflated by num_ecores */ #else uint_t ncpus = ((boot_max_ncpus == -1) ? 
max_ncpus : boot_max_ncpus); #endif @@ -2834,9 +2836,22 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri, flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, - flags | TASKQ_PREPOPULATE); + flags | TASKQ_PREPOPULATE | TASKQ_CREATE_SYNCED); + VERIFY(tq != NULL); - VERIFY(tq->tq_nthreads == nthreads); + + /* wait until our minalloc (nthreads) threads are created */ + mutex_enter(&tq->tq_lock); + for (int i = 1; tq->tq_nthreads != nthreads; i++) { + dprintf("SPL: %s:%d: waiting for tq_nthreads (%d)" + " to be nthreads (%d), (target = %d, pass %d)\n", + __func__, __LINE__, + tq->tq_nthreads, tq->tq_nthreads_target, nthreads, i); + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } + mutex_exit(&tq->tq_lock); + + VERIFY3U(tq->tq_nthreads, ==, nthreads); /* spawn all syncthreads */ for (int i = 0; i < nthreads; i++) { diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c index a58fdcc88198..ae3135208fef 100644 --- a/module/os/windows/spl/spl-windows.c +++ b/module/os/windows/spl/spl-windows.c @@ -42,6 +42,7 @@ static utsname_t utsname_static = { { 0 } }; unsigned int max_ncpus = 0; +unsigned int boot_ncpus = 0; uint64_t total_memory = 0; uint64_t real_total_memory = 0; @@ -480,6 +481,12 @@ spl_start(PUNICODE_STRING RegistryPath) max_ncpus = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); if (!max_ncpus) max_ncpus = 1; dprintf("SPL: total ncpu %d\n", max_ncpus); +#if defined(__arm64__) + num_ecores = (max_ncpus > 4) ? 4 : 0; // Apple has 4 eCores, fixme + boot_ncpus = MAX(1, (int)max_ncpus - (int)num_ecores); +#else + boot_ncpus = max_ncpus; +#endif // Not sure how to get physical RAM size in a Windows Driver // So until then, pull some numbers out of the aether. Next From 67e956d98d76498f078c7096ec1862f601257dd7 Mon Sep 17 00:00:00 2001 From: datacore-rm Date: Tue, 27 Feb 2024 14:42:36 +0530 Subject: [PATCH 7/7] SSV-23250: Resolve merge conflicts. --- module/zfs/dsl_pool.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index ca0b10badf0c..a8f83e1a8b3c 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -135,14 +135,6 @@ int zfs_delay_min_dirty_percent = 60; unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* -<<<<<<< HEAD - * This determines the number of threads used by the dp_sync_taskq. - */ -int zfs_sync_taskq_batch_pct = 75; - -/* -======= ->>>>>>> 85a589016 (Improve ZFS objset sync parallelism) * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.