From 64d2c847ba380e07b9072d65a50aa6469d2aa43f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 20 Jun 2024 15:05:45 +0900 Subject: [PATCH 1/5] btrfs: zoned: fix calc_available_free_space() for zoned mode calc_available_free_space() returns the total size of metadata (or system) block groups, which can be allocated from unallocated disk space. The logic is wrong on zoned mode in two places. First, the calculation of data_chunk_size is wrong. We always allocate one zone as one chunk, and no partial allocation of a zone. So, we should use zone_size (= data_sinfo->chunk_size) as it is. Second, the result "avail" may not be zone aligned. Since we always allocate one zone as one chunk on zoned mode, returning non-zone size aligned bytes will result in less pressure on the async metadata reclaim process. This is serious for the nearly full state with a large zone size device. Allowing over-commit too much will result in less async reclaim work and end up in ENOSPC. We can align down to the zone size to avoid that. Fixes: cb6cbab79055 ("btrfs: adjust overcommit logic when very close to full") CC: stable@vger.kernel.org # 6.9 Signed-off-by: Naohiro Aota Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d620323d08eae3..ae8c56442549c7 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, * "optimal" chunk size based on the fs size. However when we actually * allocate the chunk we will strip this down further, making it no more * than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= + * data_sinfo->chunk_size) as it is. */ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + if (!btrfs_is_zoned(fs_info)) { + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + } else { + data_chunk_size = data_sinfo->chunk_size; + } /* * Since data allocations immediately use block groups as part of the @@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. + */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } From 724d8042cef84496ddb4492dc120291f997ae26b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 24 Jun 2024 15:10:53 +0930 Subject: [PATCH 2/5] btrfs: always do the basic checks for btrfs_qgroup_inherit structure [BUG] Syzbot reports the following regression detected by KASAN: BUG: KASAN: slab-out-of-bounds in btrfs_qgroup_inherit+0x42e/0x2e20 fs/btrfs/qgroup.c:3277 Read of size 8 at addr ffff88814628ca50 by task syz-executor318/5171 CPU: 0 PID: 5171 Comm: syz-executor318 Not tainted 6.10.0-rc2-syzkaller-00010-g2ab795141095 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 btrfs_qgroup_inherit+0x42e/0x2e20 fs/btrfs/qgroup.c:3277 create_pending_snapshot+0x1359/0x29b0 fs/btrfs/transaction.c:1854 create_pending_snapshots+0x195/0x1d0 fs/btrfs/transaction.c:1922 btrfs_commit_transaction+0xf20/0x3740 fs/btrfs/transaction.c:2382 create_snapshot+0x6a1/0x9e0 fs/btrfs/ioctl.c:875 btrfs_mksubvol+0x58f/0x710 fs/btrfs/ioctl.c:1029 btrfs_mksnapshot+0xb5/0xf0 fs/btrfs/ioctl.c:1075 __btrfs_ioctl_snap_create+0x387/0x4b0 fs/btrfs/ioctl.c:1340 btrfs_ioctl_snap_create_v2+0x1f2/0x3a0 fs/btrfs/ioctl.c:1422 btrfs_ioctl+0x99e/0xc60 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcbf1992509 RSP: 002b:00007fcbf1928218 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcbf1a1f618 RCX: 00007fcbf1992509 RDX: 0000000020000280 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 00007fcbf1a1f610 R08: 00007ffea1298e97 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fcbf19eb660 R13: 00000000200002b8 R14: 00007fcbf19e60c0 R15: 0030656c69662f2e And it also pinned it down to commit b5357cb268c4 ("btrfs: qgroup: do not check qgroup inherit if qgroup is disabled"). [CAUSE] That offending commit skips the whole qgroup inherit check if qgroup is not enabled. But that also skips the very basic checks like num_ref_copies/num_excl_copies and the structure size checks. Meaning if a qgroup enable/disable race is happening at the background, and we pass a btrfs_qgroup_inherit structure when the qgroup is disabled, the check would be completely skipped. Then at the time of transaction commitment, qgroup is re-enabled and btrfs_qgroup_inherit() is going to use the incorrect structure and causing the above KASAN error. [FIX] Make btrfs_qgroup_check_inherit() only skip the source qgroup checks. So that even if invalid btrfs_qgroup_inherit structure is passed in, we can still reject invalid ones no matter if qgroup is enabled or not. Furthermore we do already have an extra safety inside btrfs_qgroup_inherit(), which would just ignore invalid qgroup sources, so even if we only skip the qgroup source check we're still safe. Reported-by: syzbot+a0d1f7e26910be4dc171@syzkaller.appspotmail.com Fixes: b5357cb268c4 ("btrfs: qgroup: do not check qgroup inherit if qgroup is disabled") Reviewed-by: Boris Burkov Reviewed-by: Jeongjun Park Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bf0f81d59b6bca..39a15cca58ca95 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { - if (!btrfs_qgroup_enabled(fs_info)) - return 0; if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) @@ -3084,6 +3082,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) return -EINVAL; + /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. + */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + /* * Now check all the remaining qgroups, they should all: * From 9da45c88e124f13a3c4d480b89b298e007fbb9e4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 23 Jun 2024 12:50:26 +0100 Subject: [PATCH 3/5] btrfs: fix uninitialized return value in the ref-verify tool In the ref-verify tool, when processing the inline references of an extent item, we may end up returning with uninitialized return value, because: 1) The 'ret' variable is not initialized if there are no inline extent references ('ptr' == 'end' before the while loop starts); 2) If we find an extent owner inline reference we don't initialize 'ret'. So fix these cases by initializing 'ret' to 0 when declaring the variable and set it to -EINVAL if we find an extent owner inline references and simple quotas are not enabled (as well as print an error message). Reported-by: Mirsad Todorovac Link: https://lore.kernel.org/linux-btrfs/59b40ebe-c824-457d-8b24-0bbca69d472b@gmail.com/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index cf531255ab76c6..9522a8b79d22b5 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: - WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } break; default: btrfs_err(fs_info, "invalid key type in iref"); From da0386c1c70da1a01b5fa8ec503b96116bc8734c Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 2 Jul 2024 07:31:13 -0700 Subject: [PATCH 4/5] btrfs: fix folio refcount in btrfs_do_encoded_write() The conversion to folios switched __free_page() to __folio_put() in the error path in btrfs_do_encoded_write(). However, this gets the page refcounting wrong. If we do hit that error path (I reproduced by modifying btrfs_do_encoded_write to pretend to always fail in a way that jumps to out_folios and running the fstests case btrfs/281), then we always hit the following BUG freeing the folio: BUG: Bad page state in process btrfs pfn:40ab0b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x61be5 pfn:0x40ab0b flags: 0x5ffff0000000000(node=0|zone=2|lastcpupid=0x1ffff) raw: 05ffff0000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000061be5 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount Call Trace: dump_stack_lvl+0x3d/0xe0 bad_page+0xea/0xf0 free_unref_page+0x8e1/0x900 ? __mem_cgroup_uncharge+0x69/0x90 __folio_put+0xe6/0x190 btrfs_do_encoded_write+0x445/0x780 ? current_time+0x25/0xd0 btrfs_do_write_iter+0x2cc/0x4b0 btrfs_ioctl_encoded_write+0x2b6/0x340 It turns out __free_page() decreases the page reference count while __folio_put() does not. Switch __folio_put() to folio_put() which decreases the folio reference count first. Fixes: 400b172b8cdc ("btrfs: compression: migrate compression/decompression paths to folios") Tested-by: Ed Tomlinson Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d0274324c75a0d..6c96a6086d3fdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10382,7 +10382,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) - __folio_put(folios[i]); + folio_put(folios[i]); } kvfree(folios); out: From a56c85fa2d59ab0780514741550edf87989a66e9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 2 Jul 2024 07:31:14 -0700 Subject: [PATCH 5/5] btrfs: fix folio refcount in __alloc_dummy_extent_buffer() Another improper use of __folio_put() in an error path after freshly allocating pages/folios which returns them with the refcount initialized to 1. The refactor from __free_pages() -> __folio_put() (instead of folio_put) removed a refcount decrement found in __free_pages() and folio_put but absent from __folio_put(). Fixes: 13df3775efca ("btrfs: cleanup metadata page pointer usage") CC: stable@vger.kernel.org # 6.8+ Tested-by: Ed Tomlinson Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f688fab55251ea..958155cc43a81c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3553,7 +3553,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); - __folio_put(eb->folios[i]); + folio_put(eb->folios[i]); } } __free_extent_buffer(eb);