From cb36c5f2fbb07aa8dea163bf9be18ca65ebcda5d Mon Sep 17 00:00:00 2001
From: Robert Evans
Date: Sat, 20 Jan 2024 18:06:55 -0500
Subject: [PATCH] Replace L1 dirty walk with DNODE_FIND_DIRTY

This walk is inherently racy w.r.t. dbuf eviction and sync. Consider:

0. A large sparse file with 3 levels of indirection.
1. A new L1 block is added to a brand-new L2 block.
2. The L1 block syncs out and is immediately evicted.
3. Before the L3->L2 BP is updated in the L3 block, dnode_free_range
   attempts to free the new L1.

In this case neither dnode_dirty_l1range nor dnode_next_offset can find
the newly synced-out L1 block and its L0 blocks:

- dnode_dirty_l1range uses the in-memory dbuf index, but the L1 has
  been evicted
- dnode_next_offset considers on-disk BPs, but the L3->L2 BP is missing

And then free_children will later PANIC because the L1 was not dirtied
in open context when freeing the range.

This case was found while testing llseek(SEEK_HOLE/SEEK_DATA) without
txg sync, and it is distinct from the _other_ free_children panic found
and addressed by #16025.

The fix is to replace dnode_dirty_l1range with
dnode_next_offset(DNODE_FIND_DIRTY), which knows how to find all dirty
L1 blocks. This change also passes minlvl=1 to avoid re-dirtying L2
blocks that were dirtied only in a prior txg: successive frees would
otherwise needlessly re-dirty already-empty L1s, wasting time during
txg sync turning them back into holes.

Signed-off-by: Robert Evans
---
 module/zfs/dnode.c | 82 +++++----------------------------------------
 1 file changed, 8 insertions(+), 74 deletions(-)

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 72fe2bfd22b7..4b110de0008b 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -2079,76 +2079,6 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 	}
 }
 
-/*
- * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
- * and end_blkid.
- */
-static void
-dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
-    dmu_tx_t *tx)
-{
-	dmu_buf_impl_t *db_search;
-	dmu_buf_impl_t *db;
-	avl_index_t where;
-
-	db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
-
-	mutex_enter(&dn->dn_dbufs_mtx);
-
-	db_search->db_level = 1;
-	db_search->db_blkid = start_blkid + 1;
-	db_search->db_state = DB_SEARCH;
-	for (;;) {
-
-		db = avl_find(&dn->dn_dbufs, db_search, &where);
-		if (db == NULL)
-			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
-
-		if (db == NULL || db->db_level != 1 ||
-		    db->db_blkid >= end_blkid) {
-			break;
-		}
-
-		/*
-		 * Setup the next blkid we want to search for.
-		 */
-		db_search->db_blkid = db->db_blkid + 1;
-		ASSERT3U(db->db_blkid, >=, start_blkid);
-
-		/*
-		 * If the dbuf transitions to DB_EVICTING while we're trying
-		 * to dirty it, then we will be unable to discover it in
-		 * the dbuf hash table. This will result in a call to
-		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
-		 * lock. To avoid a deadlock, we drop the lock before
-		 * dirtying the level-1 dbuf.
-		 */
-		mutex_exit(&dn->dn_dbufs_mtx);
-		dnode_dirty_l1(dn, db->db_blkid, tx);
-		mutex_enter(&dn->dn_dbufs_mtx);
-	}
-
-#ifdef ZFS_DEBUG
-	/*
-	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
-	 */
-	db_search->db_level = 1;
-	db_search->db_blkid = start_blkid + 1;
-	db_search->db_state = DB_SEARCH;
-	db = avl_find(&dn->dn_dbufs, db_search, &where);
-	if (db == NULL)
-		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
-	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
-		if (db->db_level != 1 || db->db_blkid >= end_blkid)
-			break;
-		if (db->db_state != DB_EVICTING)
-			ASSERT(db->db_dirtycnt > 0);
-	}
-#endif
-	kmem_free(db_search, sizeof (dmu_buf_impl_t));
-	mutex_exit(&dn->dn_dbufs_mtx);
-}
-
 void
 dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag)
 {
@@ -2332,8 +2262,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		if (last != first)
 			dnode_dirty_l1(dn, last, tx);
 
-		dnode_dirty_l1range(dn, first, last, tx);
-
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		for (uint64_t i = first + 1; i < last; i++) {
@@ -2342,10 +2270,16 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 			 * level-1 indirect block at or after i. Note
 			 * that dnode_next_offset() operates in terms of
 			 * level-0-equivalent bytes.
+			 * N.B. this uses minlvl=1 to avoid redirtying L1s
+			 * freed in prior txgs as minlvl=1 checks L0s and skips
+			 * dirty L1s containing no L0 BPs or only freed L0s.
+			 * minlvl=2 would also work, but that would then match
+			 * every dirty L1 pointer unconditionally.
 			 */
 			uint64_t ibyte = i << shift;
-			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
-			    &ibyte, 2, 1, 0);
+			int err = dnode_next_offset(
+			    dn, DNODE_FIND_HAVELOCK | DNODE_FIND_DIRTY,
+			    &ibyte, 1, 1, 0);
 			i = ibyte >> shift;
 			if (i >= last)
 				break;
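
For illustration only (not part of the patch): a heavily simplified
userland sketch of the workload from steps 0-3 above. It assumes the
default 128K recordsize and 128K indirect block size, so one L1 covers
128M of file data, one L2 covers 128G, and a write past 128G forces a
third level of indirection. The path /tank/sparse is a placeholder, and
actually hitting the panic requires the free to race a txg sync that
has written out and evicted the new L1 but not yet updated the L3->L2
BP, so this shows the shape of the trigger, not a deterministic
reproducer.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define	L1_SPAN	(128ULL << 20)	/* bytes of L0 data under one L1 */
#define	L2_SPAN	(128ULL << 30)	/* bytes of L0 data under one L2 */

int
main(void)
{
	static char buf[128 * 1024];
	/* Past the first L2, so the write allocates a brand-new L2+L1. */
	off_t off = (off_t)(L2_SPAN + L1_SPAN);
	int fd = open("/tank/sparse", O_CREAT | O_RDWR, 0644);

	if (fd == -1) {
		perror("open");
		return (1);
	}
	memset(buf, 0xab, sizeof (buf));

	/* Step 1: add a new L1 block under a brand-new L2 block. */
	if (pwrite(fd, buf, sizeof (buf), off) != (ssize_t)sizeof (buf)) {
		perror("pwrite");
		return (1);
	}

	/*
	 * Steps 2-3: while the txg carrying the write syncs out (the new
	 * L1 written and evicted, the L3->L2 BP not yet in place), free
	 * the range covering the new L1. With the old code, neither the
	 * in-core walk nor the on-disk BP walk dirties that L1 here, and
	 * free_children panics later in sync context.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    (off_t)(off - L1_SPAN), (off_t)(2 * L1_SPAN)) == -1) {
		perror("fallocate");
		return (1);
	}

	(void) close(fd);
	return (0);
}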