diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 83190c6fbe3f..028bf0155bd1 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -132,6 +132,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [
 	])
 ])
 
+dnl #
+dnl # Upstream patch for blkdev copy offload support
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_COPY_OFFLOAD], [
+	ZFS_LINUX_TEST_SRC([blkdev_copy_offload], [
+		#include <linux/blkdev.h>
+		#include <linux/blk_types.h>
+	], [
+		struct block_device *bdev_in = NULL, *bdev_out = NULL;
+		loff_t pos_in = 0, pos_out = 0;
+		ssize_t ret __attribute__ ((unused));
+		ssize_t len = 0;
+		void *private = NULL;
+		void (*endio)(void *, int, ssize_t) = NULL;
+		ret = blkdev_copy_offload(bdev_in, pos_in, pos_out, len,
+		    endio, private, GFP_KERNEL, bdev_out);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_COPY_OFFLOAD], [
+	AC_MSG_CHECKING([whether blkdev_copy_offload exists])
+	ZFS_LINUX_TEST_RESULT([blkdev_copy_offload], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_COPY_OFFLOAD, 1,
+		    [blkdev_copy_offload exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 2.6.38 API change,
 dnl # Added blkdev_put()
@@ -759,6 +789,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T
+	ZFS_AC_KERNEL_SRC_BLKDEV_COPY_OFFLOAD
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@@ -781,4 +812,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
 	ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T
+	ZFS_AC_KERNEL_BLKDEV_COPY_OFFLOAD
 ])
diff --git a/include/sys/zvol.h b/include/sys/zvol.h
index c79fe1d9ad22..e236a4cd18a3 100644
--- a/include/sys/zvol.h
+++ b/include/sys/zvol.h
@@ -56,6 +56,8 @@ extern int zvol_set_ro(const char *, boolean_t);
 extern zvol_state_handle_t *zvol_suspend(const char *);
 extern int zvol_resume(zvol_state_handle_t *);
 extern void *zvol_tag(zvol_state_handle_t *);
+extern int zvol_clone_range(zvol_state_handle_t *,
uint64_t, + zvol_state_handle_t *, uint64_t, uint64_t); extern int zvol_init(void); extern void zvol_fini(void); diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3cd0d78c353d..55021a080076 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -83,6 +83,9 @@ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len); void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7c9aae6a66af..e882e6e67f0f 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,8 @@ static unsigned int zvol_threads = 0; static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; +static boolean_t zvol_bclone_enabled = B_TRUE; +static unsigned long zvol_max_copy_bytes = 0; /* * The maximum number of volblocksize blocks to process per thread. 
Typically,
@@ -496,6 +499,85 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
+#ifdef HAVE_BLKDEV_COPY_OFFLOAD
+static void zvol_clone_range_impl(zv_request_t *zvr)
+{
+	zvol_state_t *zv_src = zvr->zv, *zv_dst = NULL;
+	struct request *req = zvr->rq;
+	struct bio *bio = zvr->bio;
+	zfs_uio_t uio_src, uio_dst;
+	uint64_t len = 0;
+	int error = EINVAL, seg = 1;
+	struct blkdev_copy_offload_io *offload_io;
+
+	if (!zvol_bclone_enabled) {
+		zvol_end_io(bio, req, -SET_ERROR(EOPNOTSUPP));
+		return;
+	}
+
+	memset(&uio_src, 0, sizeof (zfs_uio_t));
+	memset(&uio_dst, 0, sizeof (zfs_uio_t));
+
+	if (bio) {
+		/*
+		 * Single-Queue Request: driver_private contains the
+		 * destination ZVOL.
+		 */
+		offload_io = bio->bi_private;
+		if (!offload_io || !offload_io->dst_bio ||
+		    bio->bi_iter.bi_size !=
+		    offload_io->dst_bio->bi_iter.bi_size) {
+			zvol_end_io(bio, req, -SET_ERROR(error));
+			return;
+		}
+		zv_dst = offload_io->driver_private;
+		zfs_uio_bvec_init(&uio_src, bio, NULL);
+		zfs_uio_bvec_init(&uio_dst, offload_io->dst_bio, NULL);
+		len = bio->bi_iter.bi_size;
+	} else {
+		/*
+		 * Multi-Queue (MQ) Request: First bio contains information
+		 * about destination and the second contains information
+		 * about the source
+		 */
+		struct bio *bio_temp;
+		__rq_for_each_bio(bio_temp, req) {
+			if (seg == blk_rq_nr_phys_segments(req)) {
+				offload_io = bio_temp->bi_private;
+				zfs_uio_bvec_init(&uio_src, bio_temp, NULL);
+				if (len != bio_temp->bi_iter.bi_size) {
+					zvol_end_io(bio, req,
+					    -SET_ERROR(error));
+					return;
+				}
+				if (offload_io && offload_io->driver_private)
+					zv_dst = offload_io->driver_private;
+			} else {
+				zfs_uio_bvec_init(&uio_dst, bio_temp, NULL);
+				len = bio_temp->bi_iter.bi_size;
+			}
+			seg++;
+		}
+	}
+
+	if (!zv_src || !zv_dst) {
+		zvol_end_io(bio, req, -SET_ERROR(error));
+		return;
+	}
+
+	error = zvol_clone_range(zv_src, uio_src.uio_loffset, zv_dst,
+	    uio_dst.uio_loffset, len);
+	zvol_end_io(bio, req, -error);
+}
+
+static void
+zvol_clone_range_task(void
*arg) +{ + zv_request_task_t *task = arg; + zvol_clone_range_impl(&task->zvr); + zv_request_task_free(task); +} +#endif /* * Process a BIO or request @@ -555,6 +637,24 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; +#ifdef HAVE_BLKDEV_COPY_OFFLOAD + if ((bio && op_is_copy(bio_op(bio))) || + (rq && op_is_copy(req_op(rq)))) { + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + zvol_end_io(bio, rq, -SET_ERROR(EROFS)); + goto out; + } + if (force_sync) { + zvol_clone_range_impl(&zvr); + } else { + task = zv_request_task_create(zvr); + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], + zvol_clone_range_task, task, 0, &task->ent); + } + goto out; + } +#endif + if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { zvol_end_io(bio, rq, -SET_ERROR(EROFS)); @@ -1607,6 +1707,10 @@ zvol_os_create_minor(const char *name) uint64_t hash = zvol_name_hash(name); uint64_t volthreading; bool replayed_zil = B_FALSE; +#ifdef HAVE_BLKDEV_COPY_OFFLOAD + struct queue_limits *lim; + uint64_t max_clone_blocks = 1022; +#endif if (zvol_inhibit_dev) return (0); @@ -1693,6 +1797,33 @@ zvol_os_create_minor(const char *name) else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } +#ifdef HAVE_BLKDEV_COPY_OFFLOAD + lim = &zv->zv_zso->zvo_queue->limits; + lim->max_user_copy_sectors = UINT_MAX; + + /* + * When zvol_bclone_enabled is unset, blkdev_copy_offload() should + * return early and fall back to the default path. Existing zvols + * would require export/import to make this applicable. 
+	 */
+	if (!zvol_bclone_enabled) {
+		lim->max_copy_hw_sectors = 0;
+		lim->max_copy_sectors = 0;
+	} else if (!zvol_max_copy_bytes) {
+		if (zv->zv_zilog)
+			max_clone_blocks = zil_max_log_data(zv->zv_zilog,
+			    sizeof (lr_clone_range_t)) / sizeof (blkptr_t);
+		lim->max_copy_hw_sectors = MIN((doi->doi_data_block_size *
+		    max_clone_blocks), BLK_COPY_MAX_BYTES) >> SECTOR_SHIFT;
+		lim->max_copy_sectors = MIN((doi->doi_data_block_size *
+		    max_clone_blocks), BLK_COPY_MAX_BYTES) >> SECTOR_SHIFT;
+	} else {
+		lim->max_copy_hw_sectors = MIN(zvol_max_copy_bytes,
+		    BLK_COPY_MAX_BYTES) >> SECTOR_SHIFT;
+		lim->max_copy_sectors = MIN(zvol_max_copy_bytes,
+		    BLK_COPY_MAX_BYTES) >> SECTOR_SHIFT;
+	}
+#endif
 	if (replayed_zil)
 		zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
@@ -1934,6 +2065,12 @@ module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
 	"Process volblocksize blocks per thread");
 
+module_param(zvol_max_copy_bytes, ulong, 0644);
+MODULE_PARM_DESC(zvol_max_copy_bytes, "max copy bytes for zvol block cloning");
+
+module_param(zvol_bclone_enabled, uint, 0644);
+MODULE_PARM_DESC(zvol_bclone_enabled, "Enable block cloning for zvols");
+
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 module_param(zvol_open_timeout_ms, uint, 0644);
 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 6c15a5c472ea..8c8ed255e686 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1;
  * a copy of the file and is therefore not the default. However, in certain
  * scenarios this behavior may be desirable so a tunable is provided.
  */
-static int zfs_bclone_wait_dirty = 0;
+int zfs_bclone_wait_dirty = 0;
 
 /*
  * Enable Direct I/O.
If this setting is 0, then all I/O requests will be diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 78bf714170d2..b5d8d1b71111 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; +extern int zfs_bclone_wait_dirty; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -516,6 +517,241 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_clone_range_t *lr = arg2; + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + int error; + uint64_t blksz; + uint64_t off; + uint64_t len; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_BLOCK_CLONING)); + + off = lr->lr_offset; + len = lr->lr_length; + blksz = lr->lr_blksz; + + if ((off % blksz) != 0) { + return (SET_ERROR(EINVAL)); + } + + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); + if (error != 0 || !zv->zv_dn) + return (error); + tx = dmu_tx_create(os); + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + goto out; + } + error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, + tx, lr->lr_bps, lr->lr_nbps); + if (error != 0) { + dmu_tx_commit(tx); + goto out; + } + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zv->zv_zilog, tx)); + dmu_tx_commit(tx); + +out: + dnode_rele(zv->zv_dn, zv); + zv->zv_dn = NULL; + return (error); +} + +int +zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, + uint64_t outoff, uint64_t len) +{ + zilog_t *zilog_dst; + zfs_locked_range_t *inlr, *outlr; + objset_t *inos, *outos; + dmu_tx_t *tx; + blkptr_t *bps; + size_t maxblocks; + int error = EINVAL; + + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } + if (zv_src != zv_dst) + rw_enter(&zv_src->zv_suspend_lock, RW_READER); + + inos = zv_src->zv_objset; + outos = zv_dst->zv_objset; + + /* + * Sanity checks + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + error = EOPNOTSUPP; + goto out; + } + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + error = EXDEV; + goto out; + } + if (inos->os_encrypted != outos->os_encrypted) { + error = EXDEV; + goto out; + } + if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { + error = EINVAL; + goto out; + } + if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { + error = 0; + goto out; + } + + /* + * Do not read beyond boundary + */ + if (len > zv_src->zv_volsize - inoff) + len = zv_src->zv_volsize - inoff; + if (len > zv_dst->zv_volsize - outoff) + len = zv_dst->zv_volsize - outoff; + if (len == 0) { + error = 0; + goto out; + } + + /* + * No overlapping if we are cloning within the same file + */ + if (zv_src == zv_dst) { + if (inoff < outoff + len && outoff < inoff + len) { + error = EINVAL; + goto out; + } + } + + /* + * Offsets and 
length must be at block boundaries + */ + if ((inoff % zv_src->zv_volblocksize) != 0 || + (outoff % zv_dst->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + /* + * Length must be multiple of block size + */ + if ((len % zv_src->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + zilog_dst = zv_dst->zv_zilog; + maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + /* + * Maintain predictable lock order. + */ + if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + } + + while (len > 0) { + uint64_t size, last_synced_txg; + size_t nbps = maxblocks; + size = MIN(zv_src->zv_volblocksize * maxblocks, len); + last_synced_txg = spa_last_synced_txg( + dmu_objset_spa(zv_src->zv_objset)); + error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, + size, bps, &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
+ */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool + (zv_src->zv_objset), last_synced_txg + 1); + continue; + } + break; + } + + tx = dmu_tx_create(zv_dst->zv_objset); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, + tx, bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, + size, zv_src->zv_volblocksize, bps, nbps); + dmu_tx_commit(tx); + inoff += size; + outoff += size; + len -= size; + } + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog_dst, ZVOL_OBJ); + } +out: + if (zv_src != zv_dst) + rw_exit(&zv_src->zv_suspend_lock); + rw_exit(&zv_dst->zv_suspend_lock); + return (SET_ERROR(error)); +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -540,7 +776,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ @@ -550,7 +788,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_err, /* TX_CLONE_RANGE */ + zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* @@ -625,6 +863,50 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, } } +/* + * Handles TX_CLONE_RANGE transactions. 
+ */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index bc30bb7798e9..e09286f44455 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -228,14 +228,14 @@ main(int argc, char **argv) } } - int sfd = open(argv[optind], O_RDONLY); + int sfd = open(argv[optind], O_RDONLY | O_DIRECT); if (sfd < 0) { fprintf(stderr, "open: %s: %s\n", argv[optind], strerror(errno)); return (1); } - int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, + int dfd = open(argv[optind+1], O_WRONLY|O_CREAT|O_DIRECT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); if (dfd < 0) { fprintf(stderr, "open: %s: %s\n",