Illumos 4958, 5164, 5165 #2697

Closed · wants to merge 2 commits
43 changes: 37 additions & 6 deletions cmd/ztest/ztest.c
@@ -832,7 +832,7 @@ static uint64_t
ztest_get_ashift(void)
{
if (ztest_opts.zo_ashift == 0)
- return (SPA_MINBLOCKSHIFT + ztest_random(3));
+ return (SPA_MINBLOCKSHIFT + ztest_random(5));
return (ztest_opts.zo_ashift);
}
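
For context: ztest_random(5) returns a value in [0, 4], so the test now draws ashifts 9 through 13 (512-byte through 8K sectors) where it previously stopped at 11. A standalone sketch of the arithmetic, not part of the diff, assuming SPA_MINBLOCKSHIFT is 9 as in the ZFS headers:

#include <stdio.h>

#define SPA_MINBLOCKSHIFT 9	/* 512-byte minimum block, per the ZFS headers */

int
main(void)
{
	/* ztest_random(5) yields 0..4, mirroring the new upper bound. */
	for (int r = 0; r < 5; r++) {
		int ashift = SPA_MINBLOCKSHIFT + r;
		printf("ashift %d -> %d-byte sectors\n", ashift, 1 << ashift);
	}
	return (0);
}

The 8K upper end lines up with the MAX_UBERBLOCK_SHIFT cap introduced in vdev_impl.h below.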

@@ -994,11 +994,28 @@ ztest_random_spa_version(uint64_t initial_version)
return (version);
}

+ /*
+  * Find the largest ashift used
+  */
+ static uint64_t
+ ztest_spa_get_ashift(void) {
+ uint64_t i;
+ uint64_t ashift = SPA_MINBLOCKSHIFT;
+ vdev_t *rvd = ztest_spa->spa_root_vdev;
+
+ for (i = 0; i < rvd->vdev_children; i++) {
+ ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift);
+ }
+ return (ashift);
+ }

static int
ztest_random_blocksize(void)
{
- return (1 << (SPA_MINBLOCKSHIFT +
- ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
+ // Choose a block size >= the ashift.
+ uint64_t block_shift =
+ ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+ return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}
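
The new helper ties the random block size to the pool's largest ashift. A standalone sketch, not part of the diff, that enumerates every value ztest_random_blocksize() can now return, assuming SPA_MINBLOCKSHIFT = 9, SPA_MAXBLOCKSHIFT = 17 (the pre-large-block 128K maximum), and a hypothetical largest ashift of 13:

#include <stdio.h>

#define SPA_MINBLOCKSHIFT 9
#define SPA_MAXBLOCKSHIFT 17

int
main(void)
{
	int ashift = 13;	/* hypothetical largest ashift in the pool */

	/* block_shift is drawn uniformly from [0, SPA_MAXBLOCKSHIFT - ashift]. */
	for (int bs = 0; bs <= SPA_MAXBLOCKSHIFT - ashift; bs++)
		printf("possible blocksize: %d\n",
		    1 << (SPA_MINBLOCKSHIFT + bs));
	return (0);
}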

static int
@@ -5936,17 +5953,31 @@ ztest_freeze(void)
*/
spa_freeze(spa);

+ /*
+  * Because it is hard to predict how much space a write will actually
+  * require beforehand, we leave ourselves some fudge space to write over
+  * capacity.
+  */
+ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;

/*
* Run tests that generate log records but don't alter the pool config
* or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
* We do a txg_wait_synced() after each iteration to force the txg
* to increase well beyond the last synced value in the uberblock.
* The ZIL should be OK with that.
+ *
+ * Run a random number of times less than zo_maxloops and ensure we do
+ * not run out of space on the pool.
*/
while (ztest_random(10) != 0 &&
- numloops++ < ztest_opts.zo_maxloops) {
- ztest_dmu_write_parallel(zd, 0);
- ztest_dmu_object_alloc_free(zd, 0);
+ numloops++ < ztest_opts.zo_maxloops &&
+ metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+ ztest_od_t od;
+ ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+ ztest_io(zd, od.od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
txg_wait_synced(spa_get_dsl(spa), 0);
}
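
To make the loop guard concrete, with illustrative numbers only: on a pool with 256M of normal-class space, capacity works out to 128M, so the loop stops generating log records once allocations cross the halfway mark. That halved figure is the "fudge space" from the comment above: once the pool is frozen nothing is reclaimed, and each ztest_io() can consume more space than it nominally writes after ashift round-up, gang headers, and metadata are accounted for.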

11 changes: 0 additions & 11 deletions include/sys/space_map.h
@@ -133,17 +133,6 @@ typedef enum {
SM_FREE
} maptype_t;

- /*
-  * The data for a given space map can be kept on blocks of any size.
-  * Larger blocks entail fewer i/o operations, but they also cause the
-  * DMU to keep more data in-core, and also to waste more i/o bandwidth
-  * when only a few blocks have changed since the last transaction group.
-  * Rather than having a fixed block size for all space maps the block size
-  * can adjust as needed (see space_map_max_blksz). Set the initial block
-  * size for the space map to 4k.
-  */
- #define SPACE_MAP_INITIAL_BLOCKSIZE (1ULL << 12)

int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);

void space_map_histogram_clear(space_map_t *sm);
7 changes: 5 additions & 2 deletions include/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_VDEV_IMPL_H
@@ -239,8 +239,11 @@ struct vdev {
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)

+ /* The largest uberblock we support is 8k. */
+ #define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+ MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
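The cap matters because the label's uberblock ring is carved into VDEV_UBERBLOCK_SHIFT-sized slots. A standalone sketch of the ring arithmetic, not part of the diff, assuming UBERBLOCK_SHIFT is 10 (its value in uberblock_impl.h):

#include <stdio.h>

#define VDEV_UBERBLOCK_RING (128 << 10)
#define UBERBLOCK_SHIFT 10	/* 1K uberblocks, per uberblock_impl.h */
#define MAX_UBERBLOCK_SHIFT 13	/* the new 8K cap */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	int ashifts[] = { 9, 12, 13, 17 };

	for (int i = 0; i < 4; i++) {
		int ub = MIN(MAX(ashifts[i], UBERBLOCK_SHIFT),
		    MAX_UBERBLOCK_SHIFT);
		printf("ashift %2d: %5d-byte slots, %3d uberblocks per ring\n",
		    ashifts[i], 1 << ub, VDEV_UBERBLOCK_RING >> ub);
	}
	return (0);
}

Without the MIN(), an ashift-17 vdev would get a single 128K uberblock slot, collapsing the uberblock history to one entry; with the cap it keeps sixteen 8K slots.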
47 changes: 24 additions & 23 deletions include/sys/zio.h
@@ -167,48 +167,49 @@ enum zio_flag {
ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4,
ZIO_FLAG_SCAN_THREAD = 1 << 5,
+ ZIO_FLAG_PHYSICAL = 1 << 6,

#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

/*
* Flags inherited by ddt, gang, and vdev children.
*/
- ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
- ZIO_FLAG_SPECULATIVE = 1 << 7,
- ZIO_FLAG_CONFIG_WRITER = 1 << 8,
- ZIO_FLAG_DONT_RETRY = 1 << 9,
- ZIO_FLAG_DONT_CACHE = 1 << 10,
- ZIO_FLAG_NODATA = 1 << 11,
- ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 8,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 9,
+ ZIO_FLAG_DONT_RETRY = 1 << 10,
+ ZIO_FLAG_DONT_CACHE = 1 << 11,
+ ZIO_FLAG_NODATA = 1 << 12,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,

#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)

/*
* Flags inherited by vdev children.
*/
- ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 14,
- ZIO_FLAG_TRYHARD = 1 << 15,
- ZIO_FLAG_OPTIONAL = 1 << 16,
+ ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 15,
+ ZIO_FLAG_TRYHARD = 1 << 16,
+ ZIO_FLAG_OPTIONAL = 1 << 17,

#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)

/*
* Flags not inherited by any children.
*/
- ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
- ZIO_FLAG_IO_BYPASS = 1 << 19,
- ZIO_FLAG_IO_REWRITE = 1 << 20,
- ZIO_FLAG_RAW = 1 << 21,
- ZIO_FLAG_GANG_CHILD = 1 << 22,
- ZIO_FLAG_DDT_CHILD = 1 << 23,
- ZIO_FLAG_GODFATHER = 1 << 24,
- ZIO_FLAG_NOPWRITE = 1 << 25,
- ZIO_FLAG_REEXECUTED = 1 << 26,
- ZIO_FLAG_DELEGATED = 1 << 27,
- ZIO_FLAG_FASTWRITE = 1 << 28
+ ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
+ ZIO_FLAG_IO_BYPASS = 1 << 20,
+ ZIO_FLAG_IO_REWRITE = 1 << 21,
+ ZIO_FLAG_RAW = 1 << 22,
+ ZIO_FLAG_GANG_CHILD = 1 << 23,
+ ZIO_FLAG_DDT_CHILD = 1 << 24,
+ ZIO_FLAG_GODFATHER = 1 << 25,
+ ZIO_FLAG_NOPWRITE = 1 << 26,
+ ZIO_FLAG_REEXECUTED = 1 << 27,
+ ZIO_FLAG_DELEGATED = 1 << 28,
+ ZIO_FLAG_FASTWRITE = 1 << 29
};

#define ZIO_FLAG_MUSTSUCCEED 0
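
The wholesale renumbering falls out of how the inheritance masks are built: each *_INHERIT mask is "first flag of the next group minus one", which covers every lower bit, so inserting ZIO_FLAG_PHYSICAL at bit 6 pushes every later flag up by one. A standalone sketch of the mask arithmetic, not part of the diff, using the new bit positions:

#include <stdio.h>

#define ZIO_FLAG_PHYSICAL (1 << 6)	/* the newly inserted flag */
#define ZIO_FLAG_CANFAIL (1 << 7)	/* first flag of the next group */
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

int
main(void)
{
	/* (1 << 7) - 1 = 0x7f, i.e. bits 0 through 6 inclusive. */
	printf("AGG_INHERIT mask = 0x%x\n", ZIO_FLAG_AGG_INHERIT);
	printf("PHYSICAL inherited by aggregated children: %s\n",
	    (ZIO_FLAG_AGG_INHERIT & ZIO_FLAG_PHYSICAL) ? "yes" : "no");
	return (0);
}

Because PHYSICAL sits below CANFAIL, it is covered by the AGG, DDT, GANG, and VDEV inherit masks alike.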
2 changes: 1 addition & 1 deletion module/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/

51 changes: 44 additions & 7 deletions module/zfs/metaslab.c
@@ -64,6 +64,21 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
*/
int zfs_condense_pct = 200;

+ /*
+  * Condensing a metaslab is not guaranteed to actually reduce the amount of
+  * space used on disk. In particular, a space map uses data in increments of
+  * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
+  * same number of blocks after condensing. Since the goal of condensing is to
+  * reduce the number of IOPs required to read the space map, we only want to
+  * condense when we can be sure we will reduce the number of blocks used by the
+  * space map. Unfortunately, we cannot precisely compute whether or not this is
+  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+  * we apply the following heuristic: do not condense a spacemap unless the
+  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+  * blocks.
+  */
+ int zfs_metaslab_condense_block_threshold = 4;
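
A worked instance of the threshold, with illustrative numbers: on a vdev with ashift 12 the space map allocates in MAX(1 << 12, 4K) = 4K increments, so at the default threshold of 4 a space map shorter than 16K on disk is never condensed, since rewriting it could not meaningfully reduce its block count.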

/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
@@ -1633,6 +1648,8 @@ metaslab_group_preload(metaslab_group_t *mg)
* times the size than the free space range tree representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
+ * 3. The on-disk size of the space map should actually decrease.
*
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
@@ -1643,13 +1660,21 @@ metaslab_group_preload(metaslab_group_t *mg)
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
*/
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
range_seg_t *rs;
- uint64_t size, entries, segsz;
+ uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+ dmu_object_info_t doi;
+ uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;

ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
@@ -1674,9 +1699,15 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);

- return (segsz <= space_map_length(msp->ms_sm) &&
- space_map_length(msp->ms_sm) >= (zfs_condense_pct *
- sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
+ optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ object_size = space_map_length(msp->ms_sm);
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (segsz <= object_size &&
+ object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
}
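
Reading the new return as a worked example, with illustrative numbers: a loaded space map holding 1,000 segments has an optimal_size of 8,000 bytes (one 64-bit entry per segment). With zfs_condense_pct = 200 the on-disk object_size must exceed 16,000 bytes, and with a 4K record_size it must also exceed the 16K block threshold, before condensing proceeds; the unchanged segsz check still ensures that writing out the largest free segment would not by itself exceed the current map.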

/*
@@ -1833,6 +1864,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)

mutex_enter(&msp->ms_lock);

+ /*
+  * Note: metaslab_condense() clears the space_map's histogram.
+  * Therefore we must verify and remove this histogram before
+  * condensing.
+  */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);

if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
Expand All @@ -1841,9 +1881,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
}

- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
- metaslab_group_histogram_remove(mg, msp);
if (msp->ms_loaded) {
/*
* When the space map is loaded, we have an accurate
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa_misc.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
