diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 815b5d0c9cf1..b8bb2e364056 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -40,7 +40,7 @@ extern "C" { typedef struct metaslab_ops { const char *msop_name; - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; @@ -85,7 +85,10 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, - int); + int); +int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, + int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); @@ -99,6 +102,7 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); +void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); @@ -129,6 +133,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, const void *, + int, int); void metaslab_group_alloc_decrement(spa_t *, uint64_t, const void *, int, int, boolean_t); void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, const void *, int); diff --git a/include/sys/zio.h b/include/sys/zio.h index cf303d47d524..a30d8b75b9e1 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -246,6 +246,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 30) 
#define ZIO_FLAG_DELEGATED (1ULL << 31) #define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index f2f8d2b66f12..cebf4719f6e2 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2499,20 +2499,6 @@ the xattr so as to not accumulate duplicates. . .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. -.It Sy zio_dynamic_gang_headers_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int -Enable dynamically sized gang headers. -.Pp -When set to 0 (the default), the dynamic_gang_header feature will never be -activated, even if it is enabled. -All gang headers will store at most 3 children, regardless of the header's -allocated size on disk. -When set to 1, the dynamic_gang_header feature will be activated once a -gang allocation is larger than 512 bytes. -This will cause the gang header to store more more gang children on pools -with larger ashifts. -Enabling this flag and the feature will reduce multi-level gang trees, -but can result in more IOs for individual gang blocks. -. .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. 
These workers are responsible for I/O work such as compression, encryption, diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 4bdd2a042a44..3405d17b1bd4 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -61,6 +61,8 @@ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; */ uint_t metaslab_force_ganging_pct = 3; +uint_t metaslab_allow_deganging_pct = 0; + /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each @@ -1621,7 +1623,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, */ static uint64_t metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, - uint64_t max_search) + uint64_t max_size, uint64_t max_search, uint64_t *found_size) { if (*cursor == 0) *cursor = rt->rt_start; @@ -1638,7 +1640,9 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = rs_get_start(rs, rt); if (offset + size <= rs_get_end(rs, rt)) { - *cursor = offset + size; + *found_size = MIN(rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; return (offset); } rs = zfs_btree_next(bt, &where, &where); @@ -1646,12 +1650,16 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, } *cursor = 0; + *found_size = 0; return (-1ULL); } -static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); metaslab_ops_t *metaslab_allocator(spa_t *spa); 
static metaslab_ops_t metaslab_allocators[] = { @@ -1737,7 +1745,8 @@ metaslab_allocator(spa_t *spa) * ========================================================================== */ static uint64_t -metaslab_df_alloc(metaslab_t *msp, uint64_t size) +metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { /* * Find the largest power of 2 block size that evenly divides the @@ -1746,7 +1755,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ - uint64_t align = size & -size; + uint64_t align = max_size & -max_size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; @@ -1760,10 +1769,18 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; offset = -1; } else { - offset = metaslab_block_picker(rt, - cursor, size, metaslab_df_max_search); + offset = metaslab_block_picker(rt, cursor, size, max_size, + metaslab_df_max_search, found_size); + if (max_size != size && offset == -1) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; + offset = metaslab_block_picker(rt, cursor, max_size, + max_size, metaslab_df_max_search, found_size); + } } if (offset == -1) { @@ -1783,7 +1800,9 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, rt)) { offset = rs_get_start(rs, rt); - *cursor = offset + size; + *found_size = MIN(rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; } } @@ -1800,7 +1819,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * ========================================================================== */ static uint64_t 
-metaslab_cf_alloc(metaslab_t *msp, uint64_t size) +metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; @@ -1827,7 +1847,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) } offset = *cursor; - *cursor += size; + *found_size = MIN(*cursor_end - offset, max_size); + *cursor = offset + *found_size; return (offset); } @@ -1848,31 +1869,41 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) uint64_t metaslab_ndf_clump_shift = 4; static uint64_t -metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; range_seg_t *rs; range_seg_max_t rsearch; - uint64_t hbit = highbit64(size); + uint64_t hbit = highbit64(max_size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_largest_allocatable(msp); + uint64_t max_possible_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (max_size < size) + if (max_possible_size < size) return (-1ULL); rs_set_start(&rsearch, rt, *cursor); - rs_set_end(&rsearch, rt, *cursor + size); + rs_set_end(&rsearch, rt, *cursor + max_size); rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + max_size) { + hbit = highbit64(size); + cursor = &msp->ms_lbas[hbit - 1]; + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); + + rs = zfs_btree_find(t, &rsearch, &where); + } if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; rs_set_start(&rsearch, rt, 0); - rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + rs_set_end(&rsearch, rt, MIN(max_possible_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, 
&where); @@ -1882,7 +1913,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) } if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { - *cursor = rs_get_start(rs, rt) + size; + *found_size = MIN(rs_get_end(rs, rt) - rs_get_start(rs, rt), + max_size); + *cursor = rs_get_start(rs, rt) + *found_size; return (rs_get_start(rs, rt)); } return (-1ULL); @@ -4550,6 +4583,15 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } +void +metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new) +{ + ASSERT0(new->zal_size); + list_move_tail(&new->zal_list, &old->zal_list); + new->zal_size = old->zal_size; + list_destroy(&old->zal_list); +} + void metaslab_trace_init(zio_alloc_list_t *zal) { @@ -4580,7 +4622,7 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) + (flags & METASLAB_DONT_THROTTLE) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4591,6 +4633,17 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); } +void +metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, const void *tag, + int flags, int allocator) +{ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); + metaslab_group_alloc_increment(spa, vdev, tag, flags, + allocator); + } +} + static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { @@ -4614,7 +4667,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) + (flags & METASLAB_DONT_THROTTLE) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4639,13 +4692,16 @@ 
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; - VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); + VERIFYF(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, + tag), "for tag %px @ mga %px", tag, + &mga->mga_alloc_queue_depth); } #endif } static uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t txg, uint64_t *actual_size) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; @@ -4656,8 +4712,9 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY0(msp->ms_disabled); VERIFY0(msp->ms_new); - start = mc->mc_ops->msop_alloc(msp, size); + start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size); if (start != -1ULL) { + size = *actual_size; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; @@ -4798,8 +4855,9 @@ metaslab_active_mask_verify(metaslab_t *msp) static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, - int allocator, boolean_t try_hard) + uint64_t asize, uint64_t max_asize, uint64_t txg, boolean_t want_unique, + dva_t *dva, int d, int allocator, boolean_t try_hard, + uint64_t *actual_asize) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -5017,8 +5075,10 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); + offset = metaslab_block_alloc(msp, asize, max_asize, txg, + actual_asize); + metaslab_trace_add(zal, mg, msp, *actual_asize, d, offset, + allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ @@ -5104,13 
+5164,14 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, - int allocator, boolean_t try_hard) + uint64_t asize, uint64_t max_asize, uint64_t txg, boolean_t want_unique, + dva_t *dva, int d, int allocator, boolean_t try_hard, + uint64_t *actual_asize) { uint64_t offset; - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator, try_hard); + offset = metaslab_group_alloc_normal(mg, zal, asize, max_asize, txg, + want_unique, dva, d, allocator, try_hard, actual_asize); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -5137,13 +5198,10 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, return (offset); } -/* - * Allocate a block for the specified i/o. - */ -int -metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal, int allocator) +static int +metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, + int flags, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *rotor; @@ -5166,6 +5224,14 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, allocator); return (SET_ERROR(ENOSPC)); } + if (max_psize > psize && max_psize >= metaslab_force_ganging && + metaslab_allow_deganging_pct < 100 && + (random_in_range(100) >= MIN(metaslab_allow_deganging_pct, 100))) { + max_psize = (metaslab_force_ganging - psize) / 2 + psize; + } else if (max_psize > psize && max_psize >= metaslab_force_ganging && + metaslab_allow_deganging_pct == 100) { + psize = max_psize; + } /* * Start at the rotor and loop through all mgs until we find something. 
@@ -5270,10 +5336,13 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, goto next; } - ASSERT(mg->mg_class == mc); + ASSERT3P(mg->mg_class, ==, mc); uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, + txg); + ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); /* * If we don't need to try hard, then require that the @@ -5281,8 +5350,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator, try_hard); + uint64_t offset = metaslab_group_alloc(mg, zal, asize, + max_asize, txg, !try_hard, dva, d, allocator, try_hard, + &asize); if (offset != -1ULL) { /* @@ -5368,6 +5438,18 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, return (SET_ERROR(ENOSPC)); } +/* + * Allocate a block for the specified i/o. 
+ */ +int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal, int allocator) +{ + return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva, + txg, flags, zal, allocator)); +} + void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) @@ -5844,6 +5926,16 @@ int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) +{ + return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg, + hintbp, flags, zal, zio, allocator)); +} + +int +metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg, + blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, + int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; @@ -5866,8 +5958,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal, allocator); + error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, + dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); @@ -6260,6 +6352,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, allow_deganging_pct, UINT, ZMOD_RW, + "Percentage of range allocs that will be allowed to degang"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 0dd7da1aa197..05529c84afca 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -184,7 +184,8 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, ASSERT3U(rc->rc_count, >=, number); ref = avl_find(&rc->rc_tree, &s, NULL); if (unlikely(ref == NULL)) { - panic("No such hold %p on refcount %llx", holder, + PANIC("No such hold %llx on refcount %llx", + (u_longlong_t)(uintptr_t)holder, (u_longlong_t)(uintptr_t)rc); return (-1); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 74f826034ca3..904eeca81920 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -139,8 +139,6 @@ static uint_t zfs_sync_pass_rewrite = 2; int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; -static int zio_dynamic_gang_headers_enable = 0; - #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else @@ -1881,7 +1879,8 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != 
ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && + !(zio->io_flags & ZIO_FLAG_PREALLOCATED)) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -2028,7 +2027,8 @@ zio_write_compress(zio_t *zio) zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { - BP_ZERO(bp); + if (!(zio->io_flags & ZIO_FLAG_PREALLOCATED)) + BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } @@ -3023,7 +3023,12 @@ zio_write_gang_member_ready(zio_t *zio) if (BP_IS_HOLE(zio->io_bp)) return; - ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + /* + * If we're getting direct-invoked from zio_write_gang_block(), + * the bp_orig will be set. + */ + ASSERT(BP_IS_HOLE(&zio->io_bp_orig) || + zio->io_flags & ZIO_FLAG_PREALLOCATED); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); @@ -3072,7 +3077,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; - uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; @@ -3117,8 +3121,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); - if (spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && - zio_dynamic_gang_headers_enable) { + if (spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { gangblocksize = UINT64_MAX; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) { @@ -3175,40 +3178,111 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_gang_inherit_allocator(pio, zio); /* - * Create and nowait the gang children. + * Create and nowait the gang children. 
First, we try to do + * opportunistic allocations. If that fails to generate enough + * space, we fall back to normal zio_write calls. */ int g; - for (g = 0; resid != 0; resid -= lsize, g++) { - lsize = zio_roundup_alloc_size(spa, + flags &= METASLAB_ASYNC_ALLOC; + flags |= METASLAB_GANG_CHILD; + zp.zp_checksum = gio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_complevel = gio->io_prop.zp_complevel; + zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_dedup = B_FALSE; + zp.zp_dedup_verify = B_FALSE; + zp.zp_nopwrite = B_FALSE; + zp.zp_encrypt = gio->io_prop.zp_encrypt; + zp.zp_byteorder = gio->io_prop.zp_byteorder; + zp.zp_direct_write = B_FALSE; + memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); + memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); + memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); + + for (g = 0; resid != 0; g++) { + uint64_t min_size = zio_roundup_alloc_size(spa, resid / (gbh_nblkptrs(gangblocksize) - g)); - lsize = MIN(lsize, resid); - IMPLY(lsize < spa->spa_min_alloc, lsize == resid); - IMPLY(lsize >= spa->spa_min_alloc, lsize <= resid); - - zp.zp_checksum = gio->io_prop.zp_checksum; - zp.zp_compress = ZIO_COMPRESS_OFF; - zp.zp_complevel = gio->io_prop.zp_complevel; - zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; - zp.zp_level = 0; - zp.zp_copies = gio->io_prop.zp_copies; - zp.zp_dedup = B_FALSE; - zp.zp_dedup_verify = B_FALSE; - zp.zp_nopwrite = B_FALSE; - zp.zp_encrypt = gio->io_prop.zp_encrypt; - zp.zp_byteorder = gio->io_prop.zp_byteorder; - zp.zp_direct_write = B_FALSE; - memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); - memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); - memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); - - zio_t *cio = zio_write(zio, spa, txg, &((blkptr_t *)gbh)[g], - has_data ? 
abd_get_offset(pio->io_abd, pio->io_size - - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, + min_size = MIN(min_size, resid); + IMPLY(min_size < spa->spa_min_alloc, min_size == resid); + IMPLY(min_size >= spa->spa_min_alloc, min_size <= resid); + bp = &((blkptr_t *)gbh)[g]; + + zio_alloc_list_t cio_list; + metaslab_trace_init(&cio_list); + error = metaslab_alloc_range(spa, mc, min_size, resid, + bp, gio->io_prop.zp_copies, txg, NULL, + flags, &cio_list, NULL, zio->io_allocator); + + uint64_t allocated_size = 0; + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[d]); + if (asize > allocated_size) + allocated_size = asize; + } + boolean_t allocated = allocated_size != 0; + if (g == 0 && error == 0 && allocated_size == pio->io_size) { + ASSERT3U(BP_GET_NDVAS(bp), ==, gio->io_prop.zp_copies); + /* + * De-gang case: We got an allocation big enough to + * satisfy the original allocation. Just do that + * instead of ganging. + */ + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { + dva_t *dva = &zio->io_bp->blk_dva[d]; + metaslab_unalloc_dva(spa, + dva, txg); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(dva), pio, flags, + pio->io_allocator, B_FALSE); + } + metaslab_trace_move(&cio_list, &pio->io_alloc_list); + metaslab_group_alloc_increment_all(spa, bp, pio, flags, + pio->io_allocator); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio->io_allocator, + pio); + } + zio->io_bp = pio->io_bp; + *zio->io_bp = zio->io_bp_orig = *bp; + + if (zio->io_abd != NULL) + abd_free(zio->io_abd); + zio->io_orig_abd = zio->io_abd = pio->io_abd; + + zio->io_size = zio->io_orig_size = allocated_size; + zio->io_lsize = allocated_size; + zio->io_done = NULL; + zio->io_orig_pipeline = zio->io_pipeline = + (zio->io_pipeline & ~ZIO_GANG_STAGES) | + ZIO_STAGE_WRITE_COMPRESS | ZIO_STAGE_DVA_ALLOCATE; + zio->io_flags |= ZIO_FLAG_PREALLOCATED; + zp.zp_type = 
zp.zp_storage_type = pio->io_prop.zp_type; + zp.zp_level = pio->io_prop.zp_level; + zio->io_prop = zp; + + goto end; + } + + uint64_t lsize = allocated ? allocated_size : min_size; + + zio_t *cio = zio_write(zio, spa, txg, bp, has_data ? + abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, + lsize, lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + ZIO_GANG_CHILD_FLAGS(pio) | + (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark); + + resid -= lsize; zio_gang_inherit_allocator(zio, cio); + if (allocated) { + metaslab_trace_move(&cio_list, &cio->io_alloc_list); + metaslab_group_alloc_increment_all(spa, bp, cio, flags, + zio->io_allocator); + } if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -3242,6 +3316,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) gn->gn_gangblocksize; } +end: /* * Set pio's pipeline to just wait for zio to finish. 
*/ @@ -4090,6 +4165,10 @@ zio_dva_allocate(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } + if (zio->io_flags & ZIO_FLAG_PREALLOCATED) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG); + return (zio); + } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); @@ -5096,7 +5175,8 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || - BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); + BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE) || + (zio->io_flags & ZIO_FLAG_PREALLOCATED)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); @@ -5801,6 +5881,3 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); - -ZFS_MODULE_PARAM(zfs_zio, zio_, dynamic_gang_headers_enable, INT, ZMOD_RW, - "Enable dynamic gang header creation"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e2edfc9ebbb5..2a693726d97d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -718,6 +718,12 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] +[tests/functional/gang_blocks] +tests = ['gang_blocks_001_pos', 'gang_blocks_dyn_header_pos', + 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_degang', + 'gang_blocks_dyn_multi'] +tags = ['functional', 'gang_blocks'] + [tests/functional/grow] pre = post = diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index d6a791e3375d..289fa9fe4666 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -428,6 +428,10 @@ tests = ['large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg'] tags = ['functional', 'features', 'large_dnode'] 
+[tests/functional/gang_blocks] +tests = ['gang_blocks_001_pos'] +tags = ['functional', 'gang_blocks'] + [tests/functional/grow] pre = post = diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..db8073354e58 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -62,6 +62,8 @@ MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging +METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct +METASLAB_ALLOW_DEGANGING_PCT metaslab.allow_deganging_pct metaslab_allow_deganging_pct MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index dcefb26a4036..65043c2eb245 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -271,6 +271,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ + functional/gang_blocks/gang_blocks.kshlib \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ @@ -1552,6 +1553,13 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ + functional/gang_blocks/cleanup.ksh \ + functional/gang_blocks/gang_blocks_001_pos.ksh \ + functional/gang_blocks/gang_blocks_dyn_degang.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ + 
functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ + functional/gang_blocks/gang_blocks_dyn_multi.ksh \ + functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh new file mode 100755 index 000000000000..63b782e0353d --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib new file mode 100644 index 000000000000..65cf032abb80 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib @@ -0,0 +1,92 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Get 0th DVA of first L0 block of file +# +# $1 filesystem +# $2 path to file +# +function get_first_block +{ + typeset fs=$1 + typeset path=$2 + + typeset full_path="$(get_prop mountpoint $fs)/$path" + typeset obj="$(ls -i $full_path | awk '{print $1}')" + + typeset l0_line="$(zdb -ddddd $fs $obj | grep L0 | grep -v Dataset | head -n 1)" + echo $l0_line | sed 's/.*L0 \([^ ]*\).*/\1/' + + return 0 +} + +function check_gang_dva +{ + typeset last_byte="$(echo -n $1 | tail -c 1)" + [[ "$last_byte" == "G" ]] || return 1 + return 0 +} + +function check_is_gang_dva +{ + check_gang_dva $1 || log_fail "Not a gang DVA: \"$1\"" +} + +function check_not_gang_dva +{ + check_gang_dva $1 && log_fail "Gang DVA: \"$1\"" +} + +# +# Get the gang header contents of the given dva in the given pool +# +# $1 pool +# $2 dva +# $3 size (in hexadecimal) +# +function read_gang_header +{ + typeset pool=$1 + typeset dva=$2 + typeset size=$3 + + check_is_gang_dva $dva + + zdb -R $pool "${dva%:*}:$size:g" 2>&1 | grep -v "Found vdev:" +} + +save_tunable METASLAB_FORCE_GANGING +save_tunable METASLAB_FORCE_GANGING_PCT +save_tunable METASLAB_ALLOW_DEGANGING_PCT +function cleanup +{ + destroy_pool $TESTPOOL + restore_tunable METASLAB_FORCE_GANGING + restore_tunable METASLAB_FORCE_GANGING_PCT + restore_tunable 
METASLAB_ALLOW_DEGANGING_PCT +} diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh new file mode 100755 index 000000000000..0ed44d79684c --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh @@ -0,0 +1,57 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that gang block functionality behaves correctly. +# +# Strategy: +# 1. Create a pool without dynamic gang headers. +# 2. Set metaslab_force_ganging to force gang blocks to be created. +# 3. Verify that gang blocks can be read, written, and freed. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Gang blocks behave correctly." 
+ +log_onexit cleanup + +log_must zpool create -f -o feature@dynamic_gang_header=disabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 100000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=128k count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -v hole | wc -l) +[[ "$leaves" -gt 1 ]] || log_fail "Only one leaf in gang block, should not be possible" + +orig_checksum="$(cat $path | xxh128digest)" + +log_must verify_pool $TESTPOOL +log_must zinject -a +new_checksum="$(cat $path | xxh128digest)" +[[ "$orig_checksum" == "$new_checksum" ]] || log_fail "Checksum mismatch" + +log_must rm $path +log_must verify_pool $TESTPOOL + +log_pass "Gang blocks behave correctly." diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_degang.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_degang.ksh new file mode 100755 index 000000000000..2925598a8bfb --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_degang.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that deganging works correctly +# +# Strategy: +# 1. Create a pool with dynamic gang headers and ashift=12. +# 2. Set metaslab_force_ganging to force ganging. +# 3. 
Set metaslab_allow_deganging_pct to allow it to degang. +# 4. Verify that the file degangs successfully. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that deganging works correctly" + +log_onexit cleanup + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 50000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 +set_tunable32 METASLAB_ALLOW_DEGANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block $TESTPOOL/$TESTFS file) +check_not_gang_dva $first_block + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers should not be active." + +log_pass "Deganging works correctly." diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh new file mode 100755 index 000000000000..f7021e3e9088 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh @@ -0,0 +1,51 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we don't use larger gang headers on ashift=9 pools +# +# Strategy: +# 1. 
Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools." + +log_onexit cleanup + +log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool" +log_pass "We don't use large gang headers on small-ashift pools." diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh new file mode 100755 index 000000000000..01a3dad8bd2c --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we use larger gang headers on ashift=12 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force ganging. +# 3. Verify that a large file has more than 3 gang headers. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we use larger gang headers on large-ashift pools." + +log_onexit cleanup + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 200000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=1M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE) +first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<\(.*\)>.*/\1/') +check_not_gang_dva $first_dva + +num_leaves=$(echo "$leaves" | wc -l) +[[ "$num_leaves" -gt 3 ]] || log_fail "didn't use a larger gang header: \"$leaves\"" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" +log_pass "We use larger gang headers on large-ashift pools." 
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh new file mode 100755 index 000000000000..afa82a81f1b1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that multi-level ganging still works with dynamic headers +# +# Strategy: +# 1. Create a pool with dynamic gang headers and ashift=12. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we can still multi-level gang with large headers." 
+ +log_onexit cleanup + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 50000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=16M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +log_pass "We can still multi-level gang with large headers." diff --git a/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh new file mode 100755 index 000000000000..fe19ce41cc18 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh @@ -0,0 +1,27 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib