From 5906333cc4af7b3fdb8cfff1cb3e8e579bd13174 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Mon, 12 Feb 2024 11:59:52 +0100
Subject: [PATCH 1/5] btrfs: zoned: don't skip block group profile checks on
 conventional zones

On a zoned filesystem with conventional zones, we're skipping the block
group profile checks for the conventional zones.

This allows converting a zoned filesystem's data block groups to RAID
when all of the zones backing the chunk are conventional zones. But this
will lead to problems once we try to allocate chunks backed by
sequential zones.

So also perform the block group profile checks when the block group is
backed by conventional zones.

Reported-by: HAN Yuwei
Link: https://lore.kernel.org/all/1ACD2E3643008A17+da260584-2c7f-432a-9e22-9d390aae84cc@bupt.moe/#t
Reviewed-by: Boris Burkov
Reviewed-by: Naohiro Aota
Signed-off-by: Johannes Thumshirn
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/zoned.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index afeb1dc1f43a..a7f885ae56d3 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1656,6 +1656,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	}
 
 out:
+	/* Reject non-SINGLE data profiles without RST */
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+	    !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
 	if (cache->alloc_offset > cache->zone_capacity) {
 		btrfs_err(fs_info,
 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",

From 9845664b9ee47ce7ee7ea93caf47d39a9d4552c4 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 14 Feb 2024 16:19:24 +0100
Subject: [PATCH 2/5] btrfs: dev-replace: properly validate device names

There's a syzbot report that device name buffers passed to device
replace are not properly checked for string termination, which could
lead to a read out of bounds in getname_kernel().

Add a helper that validates both the source and target device name
buffers. When a devid is used as the source, initialize the source name
buffer to an empty string in case something tries to read it later.

This was originally analyzed and fixed in a different way by Edward
Adam Davis (see links).
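
For illustration only, here is a minimal user-space sketch of the same
NUL-termination check the new helper performs (the buffer length and
function name below are invented for this example and are not part of
the patch):

  #include <errno.h>
  #include <string.h>

  /* Stand-in for the fixed size of the ioctl's device name buffers. */
  #define NAME_BUF_LEN 1025

  /*
   * A fixed-size name buffer may only be treated as a string if it
   * contains a NUL byte within its bounds; memchr() never reads past
   * NAME_BUF_LEN, unlike strlen().
   */
  static int name_buf_is_terminated(const char *name)
  {
          if (memchr(name, 0, NAME_BUF_LEN) == NULL)
                  return -ENAMETOOLONG;
          return 0;
  }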
Link: https://lore.kernel.org/linux-btrfs/000000000000d1a1d1060cc9c5e7@google.com/
Link: https://lore.kernel.org/linux-btrfs/tencent_44CA0665C9836EF9EEC80CB9E7E206DF5206@qq.com/
CC: stable@vger.kernel.org # 4.19+
CC: Edward Adam Davis
Reported-and-tested-by: syzbot+33f23b49ac24f986c9e8@syzkaller.appspotmail.com
Reviewed-by: Boris Burkov
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1502d664c892..79c4293ddf37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -725,6 +725,23 @@ leave:
 	return ret;
 }
 
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+	if (args->start.srcdevid == 0) {
+		if (memchr(args->start.srcdev_name, 0,
+			   sizeof(args->start.srcdev_name)) == NULL)
+			return -ENAMETOOLONG;
+	} else {
+		args->start.srcdev_name[0] = 0;
+	}
+
+	if (memchr(args->start.tgtdev_name, 0,
+		   sizeof(args->start.tgtdev_name)) == NULL)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 			       struct btrfs_ioctl_dev_replace_args *args)
 {
@@ -737,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 	default:
 		return -EINVAL;
 	}
-
-	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
-	    args->start.tgtdev_name[0] == '\0')
-		return -EINVAL;
+	ret = btrfs_check_replace_dev_names(args);
+	if (ret < 0)
+		return ret;
 
 	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
 				      args->start.srcdevid,

From 5897710b28cabab04ea6c7547f27b7989de646ae Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 16 Feb 2024 22:17:10 +0000
Subject: [PATCH 3/5] btrfs: send: don't issue unnecessary zero writes for
 trailing hole

If we have a sparse file with a trailing hole (from the last extent's
end to i_size) and then create an extent in the file that ends before
the file's i_size, then when doing an incremental send we will issue a
write full of zeroes for the range that starts immediately after the
new extent ends and goes up to i_size.

While this isn't incorrect because the file ends up with exactly the
same data, it unnecessarily results in using extra space at the
destination with one or more extents full of zeroes instead of having a
hole. In some cases this results in using megabytes or even gigabytes
of unnecessary space.

Example reproducer:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdh
  MNT=/mnt/sdh

  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  # Create 1G sparse file.
  xfs_io -f -c "truncate 1G" $MNT/foobar

  # Create base snapshot.
  btrfs subvolume snapshot -r $MNT $MNT/mysnap1

  # Create send stream (full send) for the base snapshot.
  btrfs send -f /tmp/1.snap $MNT/mysnap1

  # Now write one extent at the beginning of the file and one somewhere
  # in the middle, leaving a gap between the end of this second extent
  # and the file's size.
  xfs_io -c "pwrite -S 0xab 0 128K" \
         -c "pwrite -S 0xcd 512M 128K" \
         $MNT/foobar

  # Now create a second snapshot which is going to be used for an
  # incremental send operation.
  btrfs subvolume snapshot -r $MNT $MNT/mysnap2

  # Create send stream (incremental send) for the second snapshot.
  btrfs send -p $MNT/mysnap1 -f /tmp/2.snap $MNT/mysnap2

  # Now recreate the filesystem by receiving both send streams and
  # verify we get the same content that the original filesystem had
  # and file foobar has only two extents with a size of 128K each.
  umount $MNT
  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  btrfs receive -f /tmp/1.snap $MNT
  btrfs receive -f /tmp/2.snap $MNT

  echo -e "\nFile fiemap in the second snapshot:"
  # Should have:
  #
  # 128K extent at file range [0, 128K[
  # hole at file range [128K, 512M[
  # 128K extent at file range [512M, 512M + 128K[
  # hole at file range [512M + 128K, 1G[
  xfs_io -r -c "fiemap -v" $MNT/mysnap2/foobar

  # File should be using 256K of data (two 128K extents).
  echo -e "\nSpace used by the file: $(du -h $MNT/mysnap2/foobar | cut -f 1)"

  umount $MNT

Running the test, we can see with fiemap that we get an extent for the
range [512M, 1G[, while in the source filesystem we have an extent for
the range [512M, 512M + 128K[ and a hole for the rest of the file (the
range [512M + 128K, 1G[):

  $ ./test.sh
  (...)
  File fiemap in the second snapshot:
  /mnt/sdh/mysnap2/foobar:
   EXT: FILE-OFFSET         BLOCK-RANGE          TOTAL   FLAGS
     0: [0..255]:           26624..26879           256   0x0
     1: [256..1048575]:     hole               1048320
     2: [1048576..2097151]: 2156544..3205119   1048576   0x1

  Space used by the file: 513M

This happens because once we finish processing an inode, at
finish_inode_if_needed(), we always issue a hole (write operations full
of zeroes) if there's a gap between the end of the last processed extent
and the file's size, even if that range is already a hole in the parent
snapshot. Fix this by issuing the hole only if the range is not already
a hole.

After this change, running the test above, we get the expected layout:

  $ ./test.sh
  (...)
  File fiemap in the second snapshot:
  /mnt/sdh/mysnap2/foobar:
   EXT: FILE-OFFSET         BLOCK-RANGE          TOTAL   FLAGS
     0: [0..255]:           26624..26879           256   0x0
     1: [256..1048575]:     hole               1048320
     2: [1048576..1048831]: 26880..27135           256   0x1
     3: [1048832..2097151]: hole               1048320

  Space used by the file: 256K

A test case for fstests will follow soon.

CC: stable@vger.kernel.org # 6.1+
Reported-by: Dorai Ashok S A
Link: https://lore.kernel.org/linux-btrfs/c0bf7818-9c45-46a8-b3d3-513230d0c86e@inix.me/
Reviewed-by: Sweet Tea Dorminy
Reviewed-by: Josef Bacik
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/send.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7902298c1f25..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 			if (ret)
 				goto out;
 		}
-		if (sctx->cur_inode_last_extent <
-		    sctx->cur_inode_size) {
-			ret = send_hole(sctx, sctx->cur_inode_size);
-			if (ret)
+		if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+			ret = range_is_hole_in_parent(sctx,
+						      sctx->cur_inode_last_extent,
+						      sctx->cur_inode_size);
+			if (ret < 0) {
 				goto out;
+			} else if (ret == 0) {
+				ret = send_hole(sctx, sctx->cur_inode_size);
+				if (ret < 0)
+					goto out;
+			} else {
+				/* Range is already a hole, skip. */
+				ret = 0;
+			}
 		}
 	}
 	if (need_truncate) {

From e06cc89475eddc1f3a7a4d471524256152c68166 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 19 Feb 2024 19:41:23 +0000
Subject: [PATCH 4/5] btrfs: fix data races when accessing the reserved amount
 of block reserves

In space-info.c we have several places where we access the ->reserved
field of a block reserve without taking the block reserve's spinlock
first, which makes KCSAN warn about a data race since that field is
always updated while holding the spinlock.
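
As a minimal sketch of the pattern the fix applies (the struct and
function names below are invented for illustration and are not part of
the patch): a field that is always written under a spinlock is also
read under that spinlock, so the reader gets a snapshot that cannot be
torn, even if it may already be stale by the time it is used.

  #include <linux/spinlock.h>
  #include <linux/types.h>

  struct counter {
          spinlock_t lock;
          u64 value;              /* always updated with ->lock held */
  };

  static inline u64 counter_snapshot(struct counter *c)
  {
          u64 ret;

          spin_lock(&c->lock);
          ret = c->value;
          spin_unlock(&c->lock);

          /* May already be stale here, but it was never a torn load. */
          return ret;
  }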
The reports from KCSAN are like the following:

  [117.193526] BUG: KCSAN: data-race in btrfs_block_rsv_release [btrfs] / need_preemptive_reclaim [btrfs]

  [117.195148] read to 0x000000017f587190 of 8 bytes by task 6303 on cpu 3:
  [117.195172]  need_preemptive_reclaim+0x222/0x2f0 [btrfs]
  [117.195992]  __reserve_bytes+0xbb0/0xdc8 [btrfs]
  [117.196807]  btrfs_reserve_metadata_bytes+0x4c/0x120 [btrfs]
  [117.197620]  btrfs_block_rsv_add+0x78/0xa8 [btrfs]
  [117.198434]  btrfs_delayed_update_inode+0x154/0x368 [btrfs]
  [117.199300]  btrfs_update_inode+0x108/0x1c8 [btrfs]
  [117.200122]  btrfs_dirty_inode+0xb4/0x140 [btrfs]
  [117.200937]  btrfs_update_time+0x8c/0xb0 [btrfs]
  [117.201754]  touch_atime+0x16c/0x1e0
  [117.201789]  filemap_read+0x674/0x728
  [117.201823]  btrfs_file_read_iter+0xf8/0x410 [btrfs]
  [117.202653]  vfs_read+0x2b6/0x498
  [117.203454]  ksys_read+0xa2/0x150
  [117.203473]  __s390x_sys_read+0x68/0x88
  [117.203495]  do_syscall+0x1c6/0x210
  [117.203517]  __do_syscall+0xc8/0xf0
  [117.203539]  system_call+0x70/0x98

  [117.203579] write to 0x000000017f587190 of 8 bytes by task 11 on cpu 0:
  [117.203604]  btrfs_block_rsv_release+0x2e8/0x578 [btrfs]
  [117.204432]  btrfs_delayed_inode_release_metadata+0x7c/0x1d0 [btrfs]
  [117.205259]  __btrfs_update_delayed_inode+0x37c/0x5e0 [btrfs]
  [117.206093]  btrfs_async_run_delayed_root+0x356/0x498 [btrfs]
  [117.206917]  btrfs_work_helper+0x160/0x7a0 [btrfs]
  [117.207738]  process_one_work+0x3b6/0x838
  [117.207768]  worker_thread+0x75e/0xb10
  [117.207797]  kthread+0x21a/0x230
  [117.207830]  __ret_from_fork+0x6c/0xb8
  [117.207861]  ret_from_fork+0xa/0x30

So add a helper to get the reserved amount of a block reserve while
holding the lock. The value may not be up to date anymore when used by
need_preemptive_reclaim() and btrfs_preempt_reclaim_metadata_space(),
but that's ok since the worst it can do is cause more reclaim work to
be done sooner rather than later. Reading the field while holding the
lock, instead of using the data_race() annotation, is done in order to
prevent load tearing.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-rsv.h  | 16 ++++++++++++++++
 fs/btrfs/space-info.c | 26 +++++++++++++-------------
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..fb440a074700 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,20 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
 	return data_race(rsv->full);
 }
 
+/*
+ * Get the reserved amount of a block reserve in a context where getting a
+ * stale value is acceptable, instead of accessing it directly and triggering
+ * a data race warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+	u64 ret;
+
+	spin_lock(&rsv->lock);
+	ret = rsv->reserved;
+	spin_unlock(&rsv->lock);
+
+	return ret;
+}
+
 #endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 				    struct btrfs_space_info *space_info)
 {
-	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+	const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
 	u64 ordered, delalloc;
 	u64 thresh;
 	u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
 	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
 	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
 	if (ordered >= delalloc)
-		used += fs_info->delayed_refs_rsv.reserved +
-			fs_info->delayed_block_rsv.reserved;
+		used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+			btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
 	else
 		used += space_info->bytes_may_use - global_rsv_size;
 
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 		enum btrfs_flush_state flush;
 		u64 delalloc_size = 0;
 		u64 to_reclaim, block_rsv_size;
-		u64 global_rsv_size = global_rsv->reserved;
+		const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
 
 		loops++;
 
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 		 * assume it's tied up in delalloc reservations.
 		 */
 		block_rsv_size = global_rsv_size +
-			delayed_block_rsv->reserved +
-			delayed_refs_rsv->reserved +
-			trans_rsv->reserved;
+			btrfs_block_rsv_reserved(delayed_block_rsv) +
+			btrfs_block_rsv_reserved(delayed_refs_rsv) +
+			btrfs_block_rsv_reserved(trans_rsv);
 		if (block_rsv_size < space_info->bytes_may_use)
 			delalloc_size = space_info->bytes_may_use - block_rsv_size;
 
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 			to_reclaim = delalloc_size;
 			flush = FLUSH_DELALLOC;
 		} else if (space_info->bytes_pinned >
-			   (delayed_block_rsv->reserved +
-			    delayed_refs_rsv->reserved)) {
+			   (btrfs_block_rsv_reserved(delayed_block_rsv) +
+			    btrfs_block_rsv_reserved(delayed_refs_rsv))) {
 			to_reclaim = space_info->bytes_pinned;
 			flush = COMMIT_TRANS;
-		} else if (delayed_block_rsv->reserved >
-			   delayed_refs_rsv->reserved) {
-			to_reclaim = delayed_block_rsv->reserved;
+		} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+			   btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+			to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
 			flush = FLUSH_DELAYED_ITEMS_NR;
 		} else {
-			to_reclaim = delayed_refs_rsv->reserved;
+			to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
 			flush = FLUSH_DELAYED_REFS_NR;
 		}
 

From c7bb26b847e5b97814f522686068c5628e2b3646 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 19 Feb 2024 20:10:07 +0000
Subject: [PATCH 5/5] btrfs: fix data race at btrfs_use_block_rsv() when
 accessing block reserve

At btrfs_use_block_rsv() we read the size of a block reserve without
locking its spinlock, which makes KCSAN complain because the size of a
block reserve is always updated while holding its spinlock.
The report from KCSAN is the following:

  [653.313148] BUG: KCSAN: data-race in btrfs_update_delayed_refs_rsv [btrfs] / btrfs_use_block_rsv [btrfs]

  [653.314755] read to 0x000000017f5871b8 of 8 bytes by task 7519 on cpu 0:
  [653.314779]  btrfs_use_block_rsv+0xe4/0x2f8 [btrfs]
  [653.315606]  btrfs_alloc_tree_block+0xdc/0x998 [btrfs]
  [653.316421]  btrfs_force_cow_block+0x220/0xe38 [btrfs]
  [653.317242]  btrfs_cow_block+0x1ac/0x568 [btrfs]
  [653.318060]  btrfs_search_slot+0xda2/0x19b8 [btrfs]
  [653.318879]  btrfs_del_csums+0x1dc/0x798 [btrfs]
  [653.319702]  __btrfs_free_extent.isra.0+0xc24/0x2028 [btrfs]
  [653.320538]  __btrfs_run_delayed_refs+0xd3c/0x2390 [btrfs]
  [653.321340]  btrfs_run_delayed_refs+0xae/0x290 [btrfs]
  [653.322140]  flush_space+0x5e4/0x718 [btrfs]
  [653.322958]  btrfs_preempt_reclaim_metadata_space+0x102/0x2f8 [btrfs]
  [653.323781]  process_one_work+0x3b6/0x838
  [653.323800]  worker_thread+0x75e/0xb10
  [653.323817]  kthread+0x21a/0x230
  [653.323836]  __ret_from_fork+0x6c/0xb8
  [653.323855]  ret_from_fork+0xa/0x30

  [653.323887] write to 0x000000017f5871b8 of 8 bytes by task 576 on cpu 3:
  [653.323906]  btrfs_update_delayed_refs_rsv+0x1a4/0x250 [btrfs]
  [653.324699]  btrfs_add_delayed_data_ref+0x468/0x6d8 [btrfs]
  [653.325494]  btrfs_free_extent+0x76/0x120 [btrfs]
  [653.326280]  __btrfs_mod_ref+0x6a8/0x6b8 [btrfs]
  [653.327064]  btrfs_dec_ref+0x50/0x70 [btrfs]
  [653.327849]  walk_up_proc+0x236/0xa50 [btrfs]
  [653.328633]  walk_up_tree+0x21c/0x448 [btrfs]
  [653.329418]  btrfs_drop_snapshot+0x802/0x1328 [btrfs]
  [653.330205]  btrfs_clean_one_deleted_snapshot+0x184/0x238 [btrfs]
  [653.330995]  cleaner_kthread+0x2b0/0x2f0 [btrfs]
  [653.331781]  kthread+0x21a/0x230
  [653.331800]  __ret_from_fork+0x6c/0xb8
  [653.331818]  ret_from_fork+0xa/0x30

So add a helper to get the size of a block reserve while holding the
lock. Reading the field while holding the lock, instead of using the
data_race() annotation, is done in order to prevent load tearing.

Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-rsv.c |  2 +-
 fs/btrfs/block-rsv.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
 
 	block_rsv = get_block_rsv(trans, root);
 
-	if (unlikely(block_rsv->size == 0))
+	if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
 		goto try_reserve;
 again:
 	ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index fb440a074700..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -117,4 +117,20 @@ static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
 	return ret;
 }
 
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and triggering a data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+	u64 ret;
+
+	spin_lock(&rsv->lock);
+	ret = rsv->size;
+	spin_unlock(&rsv->lock);
+
+	return ret;
+}
+
 #endif /* BTRFS_BLOCK_RSV_H */
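
For comparison, here is a sketch of the data_race() alternative that the
commit messages of the last two patches mention and reject (illustrative
only, not part of the series): at the btrfs_use_block_rsv() call site it
would silence the KCSAN report, but unlike the locked helpers it does
not stop the compiler from tearing the 64-bit load.

  /* Hypothetical alternative, not applied by this series. */
  if (unlikely(data_race(block_rsv->size) == 0))
          goto try_reserve;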