Skip to content

Commit

Permalink
Add TRIM support
Browse files Browse the repository at this point in the history
UNMAP/TRIM support is a frequently-requested feature to help
prevent performance from degrading on SSDs and on various other
SAN-like storage back-ends.  By issuing UNMAP/TRIM commands for
sectors which are no longer allocated the underlying device can
often more efficiently manage itself.

This TRIM implementation is modeled on the `zpool initialize`
feature which writes a pattern to all unallocated space in the
pool.  The new `zpool trim` command uses the same vdev_xlate()
code to calculate what sectors are unallocated, the same per-
vdev TRIM thread model and locking, and the same basic CLI for
a consistent user experience.  The core difference is that
instead of writing a pattern it will issue UNMAP/TRIM commands
for those extents.

The zio pipeline was updated to accommodate this by adding a new
ZIO_TYPE_TRIM type and associated spa taskq.  This new type makes
is straight forward to add the platform specific TRIM/UNMAP calls
to vdev_disk.c and vdev_file.c.  These new ZIO_TYPE_TRIM zios are
handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs.
This makes it possible to largely avoid changing the pipieline,
one exception is that TRIM zio's may exceed the 16M block size
limit since they contain no data.

In addition to the manual `zpool trim` command, a background
automatic TRIM was added and is controlled by the 'autotrim'
property.  It relies on the exact same infrastructure as the
manual TRIM.  However, instead of relying on the extents in a
metaslab's ms_allocatable range tree, a ms_trim tree is kept
per metaslab.  When 'autotrim=on', ranges added back to the
ms_allocatable tree are also added to the ms_free tree.  The
ms_free tree is then periodically consumed by an autotrim
thread which systematically walks a top level vdev's metaslabs.

Since the automatic TRIM will skip ranges it considers too small
there is value in occasionally running a full `zpool trim`.  This
may occur when the freed blocks are small and not enough time
was allowed to aggregate them.  An automatic TRIM and a manual
`zpool trim` may be run concurrently, in which case the automatic
TRIM will yield to the manual TRIM.

Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Contributions-by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Contributions-by: Tim Chase <tim@chase2k.com>
Contributions-by: Chunwei Chen <tuxoko@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes openzfs#8419 
Closes openzfs#598
  • Loading branch information
behlendorf authored Mar 29, 2019
1 parent f94b3cb commit 1b93956
Show file tree
Hide file tree
Showing 91 changed files with 5,584 additions and 430 deletions.
350 changes: 280 additions & 70 deletions cmd/zpool/zpool_main.c

Large diffs are not rendered by default.

85 changes: 83 additions & 2 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
Expand Down Expand Up @@ -374,6 +375,7 @@ ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
Expand Down Expand Up @@ -427,6 +429,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
Expand Down Expand Up @@ -4897,7 +4900,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(bigcheck, bigsize);
}
if (i == 2) {
txg_wait_open(dmu_objset_pool(os), 0);
txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
} else if (i == 3) {
txg_wait_synced(dmu_objset_pool(os), 0);
}
Expand Down Expand Up @@ -5574,6 +5577,8 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));

(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));

VERIFY0(spa_prop_get(ztest_spa, &props));

if (ztest_opts.zo_verbose >= 6)
Expand Down Expand Up @@ -6484,7 +6489,7 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
(void) printf("\n");
}
break;
case POOL_INITIALIZE_DO:
case POOL_INITIALIZE_START:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Start initialize %s", path);
if (active && error == 0)
Expand All @@ -6507,6 +6512,82 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}

/* ARGSUSED */
void
ztest_trim(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
int error = 0;

mutex_enter(&ztest_vdev_lock);

spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

/* Random leaf vdev */
vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
if (rand_vd == NULL) {
spa_config_exit(spa, SCL_VDEV, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* The random vdev we've selected may change as soon as we
* drop the spa_config_lock. We create local copies of things
* we're interested in.
*/
uint64_t guid = rand_vd->vdev_guid;
char *path = strdup(rand_vd->vdev_path);
boolean_t active = rand_vd->vdev_trim_thread != NULL;

zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
spa_config_exit(spa, SCL_VDEV, FTAG);

uint64_t cmd = ztest_random(POOL_TRIM_FUNCS);
uint64_t rate = 1 << ztest_random(30);
boolean_t partial = (ztest_random(5) > 0);
boolean_t secure = (ztest_random(5) > 0);

nvlist_t *vdev_guids = fnvlist_alloc();
nvlist_t *vdev_errlist = fnvlist_alloc();
fnvlist_add_uint64(vdev_guids, path, guid);
error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial,
secure, vdev_errlist);
fnvlist_free(vdev_guids);
fnvlist_free(vdev_errlist);

switch (cmd) {
case POOL_TRIM_CANCEL:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Cancel TRIM %s", path);
if (!active)
(void) printf(" failed (no TRIM active)");
(void) printf("\n");
}
break;
case POOL_TRIM_START:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Start TRIM %s", path);
if (active && error == 0)
(void) printf(" failed (already active)");
else if (error != 0)
(void) printf(" failed (error %d)", error);
(void) printf("\n");
}
break;
case POOL_TRIM_SUSPEND:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Suspend TRIM %s", path);
if (!active)
(void) printf(" failed (no TRIM active)");
(void) printf("\n");
}
break;
}
free(path);
mutex_exit(&ztest_vdev_lock);
}

/*
* Verify pool integrity by running zdb.
*/
Expand Down
65 changes: 65 additions & 0 deletions config/kernel-blk-queue-discard.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
dnl #
dnl # 2.6.32 - 4.x API,
dnl # blk_queue_discard()
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [
AC_MSG_CHECKING([whether blk_queue_discard() is available])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q __attribute__ ((unused)) = NULL;
int value __attribute__ ((unused));
value = blk_queue_discard(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1,
[blk_queue_discard() is available])
],[
AC_MSG_RESULT(no)
])
])

dnl #
dnl # 4.8 - 4.x API,
dnl # blk_queue_secure_erase()
dnl #
dnl # 2.6.36 - 4.7 API,
dnl # blk_queue_secdiscard()
dnl #
dnl # 2.6.x - 2.6.35 API,
dnl # Unsupported by kernel
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [
AC_MSG_CHECKING([whether blk_queue_secure_erase() is available])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q __attribute__ ((unused)) = NULL;
int value __attribute__ ((unused));
value = blk_queue_secure_erase(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1,
[blk_queue_secure_erase() is available])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether blk_queue_secdiscard() is available])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q __attribute__ ((unused)) = NULL;
int value __attribute__ ((unused));
value = blk_queue_secdiscard(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1,
[blk_queue_secdiscard() is available])
],[
AC_MSG_RESULT(no)
])
])
])
2 changes: 2 additions & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_IN_COMPAT_SYSCALL
ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64
ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC
ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ"
Expand Down
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile
tests/zfs-tests/tests/functional/cli_user/Makefile
Expand Down Expand Up @@ -328,6 +329,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/alloc_class/Makefile
tests/zfs-tests/tests/functional/threadsappend/Makefile
tests/zfs-tests/tests/functional/tmpfile/Makefile
tests/zfs-tests/tests/functional/trim/Makefile
tests/zfs-tests/tests/functional/truncate/Makefile
tests/zfs-tests/tests/functional/user_namespace/Makefile
tests/zfs-tests/tests/functional/userquota/Makefile
Expand Down
17 changes: 17 additions & 0 deletions include/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ typedef enum zfs_error {
EZFS_INITIALIZING, /* currently initializing */
EZFS_NO_INITIALIZE, /* no active initialize */
EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */
EZFS_TRIMMING, /* currently trimming */
EZFS_NO_TRIM, /* no active trim */
EZFS_TRIM_NOTSUP, /* device does not support trim */
EZFS_UNKNOWN
} zfs_error_t;

Expand Down Expand Up @@ -253,12 +256,26 @@ typedef struct splitflags {
int name_flags;
} splitflags_t;

typedef struct trimflags {
/* requested vdevs are for the entire pool */
boolean_t fullpool;

/* request a secure trim, requires support from device */
boolean_t secure;

/* trim at the requested rate in bytes/second */
uint64_t rate;
} trimflags_t;

/*
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);
extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
trimflags_t *);

extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen_one(zpool_handle_t *, void *);
Expand Down
2 changes: 2 additions & 0 deletions include/libzfs_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ int lzc_unload_key(const char *);
int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t);
int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
nvlist_t **);
int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t,
nvlist_t *, nvlist_t **);

int lzc_snaprange_space(const char *, const char *, uint64_t *);

Expand Down
30 changes: 30 additions & 0 deletions include/linux/blkdev_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,36 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
#define blk_queue_discard_granularity(x, dg) ((void)0)
#endif /* HAVE_DISCARD_GRANULARITY */

/*
* 2.6.32 - 4.x API,
* blk_queue_discard()
*/
#if !defined(HAVE_BLK_QUEUE_DISCARD)
#define blk_queue_discard(q) (0);
#endif

/*
* 4.8 - 4.x API,
* blk_queue_secure_erase()
*
* 2.6.36 - 4.7 API,
* blk_queue_secdiscard()
*
* 2.6.x - 2.6.35 API,
* Unsupported by kernel
*/
static inline int
blk_queue_discard_secure(struct request_queue *q)
{
#if defined(HAVE_BLK_QUEUE_SECURE_ERASE)
return (blk_queue_secure_erase(q));
#elif defined(HAVE_BLK_QUEUE_SECDISCARD)
return (blk_queue_secdiscard(q));
#else
return (0);
#endif
}

/*
* Default Linux IO Scheduler,
* Setting the scheduler to noop will allow the Linux IO scheduler to
Expand Down
1 change: 0 additions & 1 deletion include/spl/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ KERNEL_H = \
$(top_srcdir)/include/spl/sys/ctype.h \
$(top_srcdir)/include/spl/sys/debug.h \
$(top_srcdir)/include/spl/sys/disp.h \
$(top_srcdir)/include/spl/sys/dkioc_free_util.h \
$(top_srcdir)/include/spl/sys/dkio.h \
$(top_srcdir)/include/spl/sys/errno.h \
$(top_srcdir)/include/spl/sys/fcntl.h \
Expand Down
58 changes: 0 additions & 58 deletions include/spl/sys/dkioc_free_util.h

This file was deleted.

1 change: 1 addition & 0 deletions include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/vdev_raidz_impl.h \
$(top_srcdir)/include/sys/vdev_removal.h \
$(top_srcdir)/include/sys/vdev_trim.h \
$(top_srcdir)/include/sys/xvattr.h \
$(top_srcdir)/include/sys/zap.h \
$(top_srcdir)/include/sys/zap_impl.h \
Expand Down
Loading

0 comments on commit 1b93956

Please sign in to comment.