Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SSV 23250: Improve ZFS objset sync parallelism #94

Merged
merged 7 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/os/freebsd/spl/sys/taskq.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ extern "C" {

/*
 * Taskq handle as declared in the FreeBSD SPL taskq header.
 */
typedef struct taskq {
/* Pointer to the struct taskqueue this taskq is associated with. */
struct taskqueue *tq_queue;
/* NOTE(review): presumably the number of threads serving tq_queue -- confirm. */
int tq_nthreads;
} taskq_t;

typedef uintptr_t taskqid_t;
Expand Down Expand Up @@ -93,6 +94,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
kthread_t ***);
taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
struct proc *, uint_t);
Expand Down
2 changes: 2 additions & 0 deletions include/os/linux/spl/sys/taskq.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
extern int taskq_empty_ent(taskq_ent_t *);
extern void taskq_init_ent(taskq_ent_t *);
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
kthread_t ***);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
Expand Down
3 changes: 2 additions & 1 deletion include/os/windows/spl/sys/sysmacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ extern uint32_t cpu_number(void);
#define is_system_labeled() 0

extern unsigned int max_ncpus;
extern unsigned int boot_ncpus;
extern unsigned int num_ecores;

#ifndef RLIM64_INFINITY
#define RLIM64_INFINITY (~0ULL)
Expand Down Expand Up @@ -132,7 +134,6 @@ extern uint32_t zone_get_hostid(void *zone);
extern void spl_setup(void);
extern void spl_cleanup(void);

#define boot_ncpus max_ncpus
#define SET_ERROR(err) \
(__set_error(__FILE__, __func__, __LINE__, err), err)

Expand Down
7 changes: 5 additions & 2 deletions include/os/windows/spl/sys/taskq.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ struct proc;
#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */

#ifdef _WIN32
#define TASKQ_TIMESHARE 0x0020 /* macOS dynamic thread priority */
#define TASKQ_REALLY_DYNAMIC 0x0040 /* don't filter out TASKQ_DYNAMIC */
#define TASKQ_TIMESHARE 0x0020 /* macOS dynamic thread priority */
#define TASKQ_REALLY_DYNAMIC 0x0040 /* don't filter out TASKQ_DYNAMIC */
#define TASKQ_CREATE_SYNCED 0x0080 /* don't deflate ncpus */
#endif
/*
* Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
Expand Down Expand Up @@ -91,6 +92,8 @@ extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
proc_t *, uint_t);
extern taskq_t *taskq_create_sysdc(const char *, int, int, int,
proc_t *, uint_t, uint_t);
extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
kthread_t ***);
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void nulltask(void *);
extern void taskq_destroy(taskq_t *);
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ int dsl_dataset_rename_snapshot(const char *fsname,
const char *oldsnapname, const char *newsnapname, boolean_t recursive);
int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
minor_t cleanup_minor, const char *htag);
boolean_t zfeature_active(spa_feature_t f, void *arg);

blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);

Expand Down
2 changes: 1 addition & 1 deletion include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
*/
uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots;
} metaslab_class_allocator_t;
} ____cacheline_aligned metaslab_class_allocator_t;

/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
Expand Down
5 changes: 5 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,11 @@ extern void spa_sync_allpools(void);

extern int zfs_sync_pass_deferred_free;

/* spa sync taskqueues */
taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
void spa_sync_tq_destroy(spa_t *spa);
void spa_select_allocator(zio_t *zio);

/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;

Expand Down
27 changes: 20 additions & 7 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@
extern "C" {
#endif

/*
 * State for a single allocator: one mutex paired with one AVL tree.
 * NOTE(review): the ____cacheline_aligned attribute presumably keeps
 * neighbouring array elements on separate cache lines -- confirm.
 */
typedef struct spa_alloc {
/* Lock belonging to this allocator. */
kmutex_t spaa_lock;
/* AVL tree belonging to this allocator. */
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;

typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark;
char *se_name;
Expand Down Expand Up @@ -182,6 +187,12 @@ typedef struct spa_taskqs {
taskq_t **stqs_taskq;
} spa_taskqs_t;

/*
 * Per-thread bookkeeping for the spa sync taskq; there is one of these
 * for each thread in that taskq.
 */
typedef struct spa_syncthread_info {
/* The sync taskq thread this entry describes. */
kthread_t *sti_thread;
taskq_t *sti_wr_iss_tq; /* write issue (wr_iss) taskq assigned to sti_thread */
} spa_syncthread_info_t;

typedef enum spa_all_vdev_zap_action {
AVZ_ACTION_NONE = 0,
AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
Expand Down Expand Up @@ -250,15 +261,17 @@ struct spa {
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
/*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
* spa_allocs is an array, whose length is stored in spa_alloc_count.
* There is one tree and one lock for each allocator, to help improve
* allocation performance in write-heavy workloads.
*/
kmutex_t *spa_alloc_locks;
avl_tree_t *spa_alloc_trees;
spa_alloc_t *spa_allocs;
int spa_alloc_count;

/* per-allocator sync thread taskqs */
taskq_t *spa_sync_tq;
spa_syncthread_info_t *spa_syncthreads;

spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
Expand Down Expand Up @@ -442,7 +455,7 @@ extern char *spa_config_path;
extern char *zfs_deadman_failmode;
extern int spa_slop_shift;
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio);
extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags);
extern void spa_load_spares(spa_t *spa);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zfs_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,8 @@ extern taskq_t *system_taskq;
extern taskq_t *system_delay_taskq;

extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
kthread_t ***);
#define taskq_create_proc(a, b, c, d, e, p, f) \
(taskq_create(a, b, c, d, e, f))
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
Expand Down
10 changes: 8 additions & 2 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ enum zio_flag {
ZIO_FLAG_FASTWRITE = 1 << 31,
};

#define ZIO_ALLOCATOR_NONE (-1)
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)

#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)

Expand Down Expand Up @@ -509,6 +512,9 @@ struct zio {

/* Taskq dispatching state */
taskq_ent_t io_tqent;

/* write issue taskq selection, based upon sync thread */
taskq_t *io_wr_iss_tq;
};

enum blk_verify_flag {
Expand Down Expand Up @@ -576,8 +582,8 @@ extern void zio_shrink(zio_t *zio, uint64_t size);

extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_execute(void *zio);
extern void zio_interrupt(void *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern void zio_deadman(zio_t *zio, char *tag);
Expand Down
30 changes: 30 additions & 0 deletions lib/libzpool/taskq.c
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,36 @@ taskq_destroy(taskq_t *tq)
kmem_free(tq, sizeof (taskq_t));
}

/*
 * Build a taskq backed by exactly nthreads pool threads and report
 * those threads to the caller.
 *
 * On return *ktpp points at a freshly allocated array holding nthreads
 * kthread_t pointers, one per pool thread.  The entries are in no
 * particular order, and the caller owns the array and must free it.
 *
 * pri, minalloc and maxalloc are accepted but not used by this
 * implementation.  TASKQ_DYNAMIC, TASKQ_THREADS_CPU_PCT and
 * TASKQ_DC_BATCH are stripped from flags, and TASKQ_PREPOPULATE is
 * always added.
 */
taskq_t *
taskq_create_synced(const char *name, int nthreads, pri_t pri,
    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
{
	kthread_t **threadv;
	taskq_t *pool;
	int idx;

	/* Part of the common signature only; ignored here. */
	(void) pri;
	(void) minalloc;
	(void) maxalloc;

	threadv = kmem_zalloc(sizeof (*threadv) * nthreads, KM_SLEEP);

	/* Drop the flags this function filters out before creating. */
	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
	flags |= TASKQ_PREPOPULATE;

	pool = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
	    flags);
	VERIFY(pool != NULL);
	VERIFY(pool->tq_nthreads == nthreads);

	/* Copy the pool's thread list into the caller-owned array. */
	idx = 0;
	while (idx < nthreads) {
		threadv[idx] = pool->tq_threadlist[idx];
		idx++;
	}

	*ktpp = threadv;
	return (pool);
}

int
taskq_member(taskq_t *tq, kthread_t *t)
{
Expand Down
26 changes: 17 additions & 9 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,14 @@ If we have less than this amount of free space,
most ZPL operations (e.g. write, create) will return
.Sy ENOSPC .
.
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32kB Pc Pq int
.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
Determines the number of block allocators to use per spa instance.
Capped by the number of actual CPUs in the system.
.Pp
Note that setting this value too high could result in performance
degradation and/or excess fragmentation.
.
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
During top-level vdev removal, chunks of data are copied from the vdev
which may include free space in order to trade bandwidth for IOPS.
This parameter determines the maximum span of free space, in bytes,
Expand Down Expand Up @@ -1869,14 +1876,7 @@ and may need to load new metaslabs to satisfy these allocations.
.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq int
Rewrite new block pointers starting in this pass.
.
.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int
This controls the number of threads used by
.Sy dp_sync_taskq .
The default value of
.Sy 75%
will create a maximum of one thread per CPU.
.
.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128MB Pc Pq uint
.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Maximum size of TRIM command.
Larger ranges will be split into chunks no larger than this value before issuing.
.
Expand Down Expand Up @@ -2128,6 +2128,14 @@ If
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
.
.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
Determines the number of CPUs to run write issue taskqs.
.Pp
When 0 (the default), the value to use is computed internally
as the number of actual CPUs in the system divided by the
.Sy spa_num_allocators
value.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on
Expand Down
64 changes: 57 additions & 7 deletions module/nvpair/nvpair.c
Original file line number Diff line number Diff line change
Expand Up @@ -3216,6 +3216,56 @@ nvs_xdr_nvl_fini(nvstream_t *nvs)
return (0);
}

/*
 * xdrproc_t-compatible callbacks for xdr_array()
 *
 * NVS_BUILD_XDRPROC_T(type) defines a static function named
 * nvs_xdr_nvp_<type>() that simply forwards its XDR stream and data
 * pointer to xdr_<type>().  Only the wrapper's parameter list differs
 * between the three build environments selected below.
 */

#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */

/* Fixed two-argument form: (XDR *, void *). */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \
{ \
return (xdr_##type(xdrs, ptr)); \
}

#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */

/*
 * Variadic form: (XDR *, ...).  The data pointer is fetched as the
 * first variable argument before being passed on.
 */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, ...) \
{ \
va_list args; \
void *ptr; \
\
va_start(args, xdrs); \
ptr = va_arg(args, void *); \
va_end(args); \
\
return (xdr_##type(xdrs, ptr)); \
}

#else /* FreeBSD, sunrpc */

/*
 * Form with a named data pointer followed by a variable argument list:
 * (XDR *, void *, ...).  Any additional arguments are not examined.
 */
#define NVS_BUILD_XDRPROC_T(type) \
static bool_t \
nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \
{ \
return (xdr_##type(xdrs, ptr)); \
}

#endif

/* Instantiate one wrapper per element type handed to xdr_array(). */
/* BEGIN CSTYLED */
NVS_BUILD_XDRPROC_T(char);
NVS_BUILD_XDRPROC_T(short);
NVS_BUILD_XDRPROC_T(u_short);
NVS_BUILD_XDRPROC_T(int);
NVS_BUILD_XDRPROC_T(u_int);
NVS_BUILD_XDRPROC_T(longlong_t);
NVS_BUILD_XDRPROC_T(u_longlong_t);
/* END CSTYLED */

/*
* The format of xdr encoded nvpair is:
* encode_size, decode_size, name string, data type, nelem, data
Expand Down Expand Up @@ -3338,38 +3388,38 @@ nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
case DATA_TYPE_INT8_ARRAY:
case DATA_TYPE_UINT8_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
(xdrproc_t)xdr_char);
nvs_xdr_nvp_char);
break;

case DATA_TYPE_INT16_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
sizeof (int16_t), (xdrproc_t)xdr_short);
sizeof (int16_t), nvs_xdr_nvp_short);
break;

case DATA_TYPE_UINT16_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
sizeof (uint16_t), (xdrproc_t)xdr_u_short);
sizeof (uint16_t), nvs_xdr_nvp_u_short);
break;

case DATA_TYPE_BOOLEAN_ARRAY:
case DATA_TYPE_INT32_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
sizeof (int32_t), (xdrproc_t)xdr_int);
sizeof (int32_t), nvs_xdr_nvp_int);
break;

case DATA_TYPE_UINT32_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
sizeof (uint32_t), (xdrproc_t)xdr_u_int);
sizeof (uint32_t), nvs_xdr_nvp_u_int);
break;

case DATA_TYPE_INT64_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
sizeof (int64_t), nvs_xdr_nvp_longlong_t);
break;

case DATA_TYPE_UINT64_ARRAY:
ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t);
break;

case DATA_TYPE_STRING_ARRAY: {
Expand Down
Loading
Loading