linux/fs/btrfs/backref.h

495 lines
14 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*/
#ifndef BTRFS_BACKREF_H
#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
#include "messages.h"
#include "ulist.h"
#include "disk-io.h"
#include "extent_io.h"
btrfs: use a single argument for extent offset in backref walking functions The interface for find_parent_nodes() has two extent offset related arguments: 1) One u64 pointer argument for the extent offset; 2) One boolean argument to tell if the extent offset should be ignored or not. These are confusing, becase the extent offset pointer can be NULL and in some cases callers pass a NULL value as a way to tell the backref walking code to ignore offsets in file extent items (and simply consider all file extent items that point to the target data extent). The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents"), but it was never really necessary, it was enough if it could find a way to get a NULL value passed to the "extent_item_pos" argument of find_parent_nodes(). The arguments are also passed to functions called by find_parent_nodes() and respective helper functions, which further makes everything more complicated than needed. Then we have several backref walking related functions that end up calling find_parent_nodes(), either directly or through some other function that they call, and for many we have to use an "extent_item_pos" (u64) argument and a boolean "ignore_offset" argument too. This is confusing and not really necessary. So use a single argument to specify the extent offset, as a simple u64 and not as a pointer, but using a special value of (u64)-1, defined as a documented constant, to indicate when the extent offset should be ignored. This is also preparation work for the upcoming patches in the series that add other arguments to find_parent_nodes() and other related functions that use it. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-11-01 16:15:46 +00:00
/*
* Context and arguments for backref walking functions. Some of the fields are
* to be filled by the caller of such functions while other are filled by the
* functions themselves, as described below.
btrfs: use a single argument for extent offset in backref walking functions The interface for find_parent_nodes() has two extent offset related arguments: 1) One u64 pointer argument for the extent offset; 2) One boolean argument to tell if the extent offset should be ignored or not. These are confusing, becase the extent offset pointer can be NULL and in some cases callers pass a NULL value as a way to tell the backref walking code to ignore offsets in file extent items (and simply consider all file extent items that point to the target data extent). The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents"), but it was never really necessary, it was enough if it could find a way to get a NULL value passed to the "extent_item_pos" argument of find_parent_nodes(). The arguments are also passed to functions called by find_parent_nodes() and respective helper functions, which further makes everything more complicated than needed. Then we have several backref walking related functions that end up calling find_parent_nodes(), either directly or through some other function that they call, and for many we have to use an "extent_item_pos" (u64) argument and a boolean "ignore_offset" argument too. This is confusing and not really necessary. So use a single argument to specify the extent offset, as a simple u64 and not as a pointer, but using a special value of (u64)-1, defined as a documented constant, to indicate when the extent offset should be ignored. This is also preparation work for the upcoming patches in the series that add other arguments to find_parent_nodes() and other related functions that use it. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-11-01 16:15:46 +00:00
*/
struct btrfs_backref_walk_ctx {
/*
* The address of the extent for which we are doing backref walking.
* Can be either a data extent or a metadata extent.
*
* Must always be set by the top level caller.
*/
u64 bytenr;
/*
* Offset relative to the target extent. This is only used for data
* extents, and it's meaningful because we can have file extent items
* that point only to a section of a data extent ("bookend" extents),
* and we want to filter out any that don't point to a section of the
* data extent containing the given offset.
*
* Must always be set by the top level caller.
*/
u64 extent_item_pos;
/*
* If true and bytenr corresponds to a data extent, then references from
* all file extent items that point to the data extent are considered,
* @extent_item_pos is ignored.
*/
bool ignore_extent_item_pos;
/* A valid transaction handle or NULL. */
struct btrfs_trans_handle *trans;
/*
* The file system's info object, can not be NULL.
*
* Must always be set by the top level caller.
*/
struct btrfs_fs_info *fs_info;
/*
* Time sequence acquired from btrfs_get_tree_mod_seq(), in case the
* caller joined the tree mod log to get a consistent view of b+trees
* while we do backref walking, or BTRFS_SEQ_LAST.
* When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses
* commit roots when searching b+trees - this is a special case for
* qgroups used during a transaction commit.
*/
u64 time_seq;
/*
* Used to collect the bytenr of metadata extents that point to the
* target extent.
*/
struct ulist *refs;
/*
* List used to collect the IDs of the roots from which the target
* extent is accessible. Can be NULL in case the caller does not care
* about collecting root IDs.
*/
struct ulist *roots;
};
btrfs: use a single argument for extent offset in backref walking functions The interface for find_parent_nodes() has two extent offset related arguments: 1) One u64 pointer argument for the extent offset; 2) One boolean argument to tell if the extent offset should be ignored or not. These are confusing, becase the extent offset pointer can be NULL and in some cases callers pass a NULL value as a way to tell the backref walking code to ignore offsets in file extent items (and simply consider all file extent items that point to the target data extent). The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents"), but it was never really necessary, it was enough if it could find a way to get a NULL value passed to the "extent_item_pos" argument of find_parent_nodes(). The arguments are also passed to functions called by find_parent_nodes() and respective helper functions, which further makes everything more complicated than needed. Then we have several backref walking related functions that end up calling find_parent_nodes(), either directly or through some other function that they call, and for many we have to use an "extent_item_pos" (u64) argument and a boolean "ignore_offset" argument too. This is confusing and not really necessary. So use a single argument to specify the extent offset, as a simple u64 and not as a pointer, but using a special value of (u64)-1, defined as a documented constant, to indicate when the extent offset should be ignored. This is also preparation work for the upcoming patches in the series that add other arguments to find_parent_nodes() and other related functions that use it. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-11-01 16:15:46 +00:00
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
struct btrfs_data_container *fspath;
};
btrfs: speedup checking for extent sharedness during fiemap One of the most expensive tasks performed during fiemap is to check if an extent is shared. This task has two major steps: 1) Check if the data extent is shared. This implies checking the extent item in the extent tree, checking delayed references, etc. If we find the data extent is directly shared, we terminate immediately; 2) If the data extent is not directly shared (its extent item has a refcount of 1), then it may be shared if we have snapshots that share subtrees of the inode's subvolume b+tree. So we check if the leaf containing the file extent item is shared, then its parent node, then the parent node of the parent node, etc, until we reach the root node or we find one of them is shared - in which case we stop immediately. During fiemap we process the extents of a file from left to right, from file offset 0 to EOF. This means that we iterate b+tree leaves from left to right, and has the implication that we keep repeating that second step above several times for the same b+tree path of the inode's subvolume b+tree. For example, if we have two file extent items in leaf X, and the path to leaf X is A -> B -> C -> X, then when we try to determine if the data extent referenced by the first extent item is shared, we check if the data extent is shared - if it's not, then we check if leaf X is shared, if not, then we check if node C is shared, if not, then check if node B is shared, if not than check if node A is shared. When we move to the next file extent item, after determining the data extent is not shared, we repeat the checks for X, C, B and A - doing all the expensive searches in the extent tree, delayed refs, etc. If we have thousands of tile extents, then we keep repeating the sharedness checks for the same paths over and over. On a file that has no shared extents or only a small portion, it's easy to see that this scales terribly with the number of extents in the file and the sizes of the extent and subvolume b+trees. This change eliminates the repeated sharedness check on extent buffers by caching the results of the last path used. The results can be used as long as no snapshots were created since they were cached (for not shared extent buffers) or no roots were dropped since they were cached (for shared extent buffers). This greatly reduces the time spent by fiemap for files with thousands of extents and/or large extent and subvolume b+trees. Example performance test: $ cat fiemap-perf-test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount -o compress=lzo $DEV $MNT # 40G gives 327680 128K file extents (due to compression). xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 3597 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 2107 milliseconds (metadata cached) After this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1646 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 698 milliseconds (metadata cached) That's about 2.2x faster when no metadata is cached, and about 3x faster when all metadata is cached. On a real filesystem with many other files, data, directories, etc, the b+trees will be 2 or 3 levels higher, therefore this optimization will have a higher impact. Several reports of a slow fiemap show up often, the two Link tags below refer to two recent reports of such slowness. This patch, together with the next ones in the series, is meant to address that. Link: https://lore.kernel.org/linux-btrfs/21dd32c6-f1f9-f44a-466a-e18fdc6788a7@virtuozzo.com/ Link: https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-01 13:18:28 +00:00
struct btrfs_backref_shared_cache_entry {
u64 bytenr;
u64 gen;
bool is_shared;
};
btrfs: cache sharedness of the last few data extents during fiemap During fiemap we process all the file extent items of an inode, by their file offset order (left to right b+tree order), and then check if the data extent they point at is shared or not. Until now we didn't cache those results, we only did it for b+tree nodes/leaves since for each unique b+tree path we have access to hundreds of file extent items. However, it is also common to repeat checking the sharedness of a particular data extent in a very short time window, and the cases that lead to that are the following: 1) COW writes. If have a file extent item like this: [ bytenr X, offset = 0, num_bytes = 512K ] file offset 0 512K Then a 4K write into file offset 64K happens, we end up with the following file extent item layout: [ bytenr X, offset = 0, num_bytes = 64K ] file offset 0 64K [ bytenr Y, offset = 0, num_bytes = 4K ] file offset 64K 68K [ bytenr X, offset = 68K, num_bytes = 444K ] file offset 68K 512K So during fiemap we well check for the sharedness of the data extent with bytenr X twice. Typically for COW writes and for at least moderately updated files, we end up with many file extent items that point to different sections of the same data extent. 2) Writing into a NOCOW file after a snapshot is taken. This happens if the target extent was created in a generation older than the generation where the last snapshot for the root (the tree the inode belongs to) was made. This leads to a scenario like the previous one. 3) Writing into sections of a preallocated extent. For example if a file has the following layout: [ bytenr X, offset = 0, num_bytes = 1M, type = prealloc ] 0 1M After doing a 4K write into file offset 0 and another 4K write into offset 512K, we get the following layout: [ bytenr X, offset = 0, num_bytes = 4K, type = regular ] 0 4K [ bytenr X, offset = 4K, num_bytes = 508K, type = prealloc ] 4K 512K [ bytenr X, offset = 512K, num_bytes = 4K, type = regular ] 512K 516K [ bytenr X, offset = 516K, num_bytes = 508K, type = prealloc ] 516K 1M So we end up with 4 consecutive file extent items pointing to the data extent at bytenr X. 4) Hole punching in the middle of an extent. For example if a file has the following file extent item: [ bytenr X, offset = 0, num_bytes = 8M ] 0 8M And then hole is punched for the file range [4M, 6M[, we our file extent item split into two: [ bytenr X, offset = 0, num_bytes = 4M ] 0 4M [ 2M hole, implicit or explicit depending on NO_HOLES feature ] 4M 6M [ bytenr X, offset = 6M, num_bytes = 2M ] 6M 8M Again, we end up with two file extent items pointing to the same data extent. 5) When reflinking (clone and deduplication) within the same file. This is probably the least common case of all. In cases 1, 2, 4 and 4, when we have multiple file extent items that point to the same data extent, their distance is usually short, typically separated by a few slots in a b+tree leaf (or across sibling leaves). For case 5, the distance can vary a lot, but it's typically the less common case. This change caches the result of the sharedness checks for data extents, but only for the last 8 extents that we notice that our inode refers to with multiple file extent items. Whenever we want to check if a data extent is shared, we lookup the cache which consists of doing a linear scan of an 8 elements array, and if we find the data extent there, we return the result and don't check the extent tree and delayed refs. The array/cache is small so that doing the search has no noticeable negative impact on the performance in case we don't have file extent items within a distance of 8 slots that point to the same data extent. Slots in the cache/array are overwritten in a simple round robin fashion, as that approach fits very well. Using this simple approach with only the last 8 data extents seen is effective as usually when multiple file extents items point to the same data extent, their distance is within 8 slots. It also uses very little memory and the time to cache a result or lookup the cache is negligible. The following test was run on non-debug kernel (Debian's default kernel config) to measure the impact in the case of COW writes (first example given above), where we run fiemap after overwriting 33% of the blocks of a file: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1 * 1024 * 1024 * 1024)) # Create the file full of 1M extents. xfs_io -f -s -c "pwrite -b 1M -S 0xab 0 $FILE_SIZE" $MNT/foobar block_count=$((FILE_SIZE / 4096)) # Overwrite about 33% of the file blocks. overwrite_count=$((block_count / 3)) echo -e "\nOverwriting $overwrite_count 4K blocks (out of $block_count)..." RANDOM=123 for ((i = 1; i <= $overwrite_count; i++)); do off=$(((RANDOM % block_count) * 4096)) xfs_io -c "pwrite -S 0xcd $off 4K" $MNT/foobar > /dev/null echo -ne "\r$i blocks overwritten..." done echo -e "\n" # Unmount and mount to clear all cached metadata. umount $MNT mount $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds" umount $MNT Result before applying this patch: fiemap took 128 milliseconds Result after applying this patch: fiemap took 92 milliseconds (-28.1%) The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-11 12:17:06 +00:00
#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
struct btrfs_backref_share_check_ctx {
/* Ulists used during backref walking. */
struct ulist refs;
btrfs: avoid duplicated resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. Currently whenever we find the data extent that a file extent item points to is not directly shared, we always resolve the path in the fs tree, and then check if any extent buffer in the path is shared. This is a lot of work and when we have file extent items that belong to the same leaf, we have the same path, so we only need to calculate it once. This change does that, it keeps track of the current and previous leaf, and when we find that a data extent is not directly shared, we try to compute the fs tree path only once and then use it for every other file extent item in the same leaf, using the existing cached path result for the leaf as long as the cache results are valid. This saves us from doing expensive b+tree searches in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-with-snapshots.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 # Create a snapshot so all the extents become indirectly shared # through subtrees, with a generation less than or equals to the # generation used to create the snapshot. btrfs subvolume snapshot -r $MNT $MNT/snap1 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Result before applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1204 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 729 milliseconds (metadata cached) Result after applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 732 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 421 milliseconds (metadata cached) That's a -46.1% total reduction for the metadata not cached case, and a -42.2% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-11 12:17:08 +00:00
/*
* The current leaf the caller of btrfs_is_data_extent_shared() is at.
* Typically the caller (at the moment only fiemap) tries to determine
* the sharedness of data extents point by file extent items from entire
* leaves.
*/
u64 curr_leaf_bytenr;
/*
* The previous leaf the caller was at in the previous call to
* btrfs_is_data_extent_shared(). This may be the same as the current
* leaf. On the first call it must be 0.
*/
u64 prev_leaf_bytenr;
btrfs: speedup checking for extent sharedness during fiemap One of the most expensive tasks performed during fiemap is to check if an extent is shared. This task has two major steps: 1) Check if the data extent is shared. This implies checking the extent item in the extent tree, checking delayed references, etc. If we find the data extent is directly shared, we terminate immediately; 2) If the data extent is not directly shared (its extent item has a refcount of 1), then it may be shared if we have snapshots that share subtrees of the inode's subvolume b+tree. So we check if the leaf containing the file extent item is shared, then its parent node, then the parent node of the parent node, etc, until we reach the root node or we find one of them is shared - in which case we stop immediately. During fiemap we process the extents of a file from left to right, from file offset 0 to EOF. This means that we iterate b+tree leaves from left to right, and has the implication that we keep repeating that second step above several times for the same b+tree path of the inode's subvolume b+tree. For example, if we have two file extent items in leaf X, and the path to leaf X is A -> B -> C -> X, then when we try to determine if the data extent referenced by the first extent item is shared, we check if the data extent is shared - if it's not, then we check if leaf X is shared, if not, then we check if node C is shared, if not, then check if node B is shared, if not than check if node A is shared. When we move to the next file extent item, after determining the data extent is not shared, we repeat the checks for X, C, B and A - doing all the expensive searches in the extent tree, delayed refs, etc. If we have thousands of tile extents, then we keep repeating the sharedness checks for the same paths over and over. On a file that has no shared extents or only a small portion, it's easy to see that this scales terribly with the number of extents in the file and the sizes of the extent and subvolume b+trees. This change eliminates the repeated sharedness check on extent buffers by caching the results of the last path used. The results can be used as long as no snapshots were created since they were cached (for not shared extent buffers) or no roots were dropped since they were cached (for shared extent buffers). This greatly reduces the time spent by fiemap for files with thousands of extents and/or large extent and subvolume b+trees. Example performance test: $ cat fiemap-perf-test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount -o compress=lzo $DEV $MNT # 40G gives 327680 128K file extents (due to compression). xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 3597 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 2107 milliseconds (metadata cached) After this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1646 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 698 milliseconds (metadata cached) That's about 2.2x faster when no metadata is cached, and about 3x faster when all metadata is cached. On a real filesystem with many other files, data, directories, etc, the b+trees will be 2 or 3 levels higher, therefore this optimization will have a higher impact. Several reports of a slow fiemap show up often, the two Link tags below refer to two recent reports of such slowness. This patch, together with the next ones in the series, is meant to address that. Link: https://lore.kernel.org/linux-btrfs/21dd32c6-f1f9-f44a-466a-e18fdc6788a7@virtuozzo.com/ Link: https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-01 13:18:28 +00:00
/*
* A path from a root to a leaf that has a file extent item pointing to
* a given data extent should never exceed the maximum b+tree height.
*/
struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
bool use_path_cache;
btrfs: cache sharedness of the last few data extents during fiemap During fiemap we process all the file extent items of an inode, by their file offset order (left to right b+tree order), and then check if the data extent they point at is shared or not. Until now we didn't cache those results, we only did it for b+tree nodes/leaves since for each unique b+tree path we have access to hundreds of file extent items. However, it is also common to repeat checking the sharedness of a particular data extent in a very short time window, and the cases that lead to that are the following: 1) COW writes. If have a file extent item like this: [ bytenr X, offset = 0, num_bytes = 512K ] file offset 0 512K Then a 4K write into file offset 64K happens, we end up with the following file extent item layout: [ bytenr X, offset = 0, num_bytes = 64K ] file offset 0 64K [ bytenr Y, offset = 0, num_bytes = 4K ] file offset 64K 68K [ bytenr X, offset = 68K, num_bytes = 444K ] file offset 68K 512K So during fiemap we well check for the sharedness of the data extent with bytenr X twice. Typically for COW writes and for at least moderately updated files, we end up with many file extent items that point to different sections of the same data extent. 2) Writing into a NOCOW file after a snapshot is taken. This happens if the target extent was created in a generation older than the generation where the last snapshot for the root (the tree the inode belongs to) was made. This leads to a scenario like the previous one. 3) Writing into sections of a preallocated extent. For example if a file has the following layout: [ bytenr X, offset = 0, num_bytes = 1M, type = prealloc ] 0 1M After doing a 4K write into file offset 0 and another 4K write into offset 512K, we get the following layout: [ bytenr X, offset = 0, num_bytes = 4K, type = regular ] 0 4K [ bytenr X, offset = 4K, num_bytes = 508K, type = prealloc ] 4K 512K [ bytenr X, offset = 512K, num_bytes = 4K, type = regular ] 512K 516K [ bytenr X, offset = 516K, num_bytes = 508K, type = prealloc ] 516K 1M So we end up with 4 consecutive file extent items pointing to the data extent at bytenr X. 4) Hole punching in the middle of an extent. For example if a file has the following file extent item: [ bytenr X, offset = 0, num_bytes = 8M ] 0 8M And then hole is punched for the file range [4M, 6M[, we our file extent item split into two: [ bytenr X, offset = 0, num_bytes = 4M ] 0 4M [ 2M hole, implicit or explicit depending on NO_HOLES feature ] 4M 6M [ bytenr X, offset = 6M, num_bytes = 2M ] 6M 8M Again, we end up with two file extent items pointing to the same data extent. 5) When reflinking (clone and deduplication) within the same file. This is probably the least common case of all. In cases 1, 2, 4 and 4, when we have multiple file extent items that point to the same data extent, their distance is usually short, typically separated by a few slots in a b+tree leaf (or across sibling leaves). For case 5, the distance can vary a lot, but it's typically the less common case. This change caches the result of the sharedness checks for data extents, but only for the last 8 extents that we notice that our inode refers to with multiple file extent items. Whenever we want to check if a data extent is shared, we lookup the cache which consists of doing a linear scan of an 8 elements array, and if we find the data extent there, we return the result and don't check the extent tree and delayed refs. The array/cache is small so that doing the search has no noticeable negative impact on the performance in case we don't have file extent items within a distance of 8 slots that point to the same data extent. Slots in the cache/array are overwritten in a simple round robin fashion, as that approach fits very well. Using this simple approach with only the last 8 data extents seen is effective as usually when multiple file extents items point to the same data extent, their distance is within 8 slots. It also uses very little memory and the time to cache a result or lookup the cache is negligible. The following test was run on non-debug kernel (Debian's default kernel config) to measure the impact in the case of COW writes (first example given above), where we run fiemap after overwriting 33% of the blocks of a file: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1 * 1024 * 1024 * 1024)) # Create the file full of 1M extents. xfs_io -f -s -c "pwrite -b 1M -S 0xab 0 $FILE_SIZE" $MNT/foobar block_count=$((FILE_SIZE / 4096)) # Overwrite about 33% of the file blocks. overwrite_count=$((block_count / 3)) echo -e "\nOverwriting $overwrite_count 4K blocks (out of $block_count)..." RANDOM=123 for ((i = 1; i <= $overwrite_count; i++)); do off=$(((RANDOM % block_count) * 4096)) xfs_io -c "pwrite -S 0xcd $off 4K" $MNT/foobar > /dev/null echo -ne "\r$i blocks overwritten..." done echo -e "\n" # Unmount and mount to clear all cached metadata. umount $MNT mount $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds" umount $MNT Result before applying this patch: fiemap took 128 milliseconds Result after applying this patch: fiemap took 92 milliseconds (-28.1%) The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-10-11 12:17:06 +00:00
/*
* Cache the sharedness result for the last few extents we have found,
* but only for extents for which we have multiple file extent items
* that point to them.
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
* extents, NOCOW writes after snapshoting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
* linear scan, whenever btrfs_is_data_extent_shared() is called, so
* it must be small so that it does not negatively affect performance in
* case we don't have multiple file extent items that point to the same
* data extent.
*/
struct {
u64 bytenr;
bool is_shared;
} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
/*
* The slot in the prev_extents_cache array that will be used for
* storing the sharedness result of a new data extent.
*/
int prev_extents_cache_slot;
btrfs: speedup checking for extent sharedness during fiemap One of the most expensive tasks performed during fiemap is to check if an extent is shared. This task has two major steps: 1) Check if the data extent is shared. This implies checking the extent item in the extent tree, checking delayed references, etc. If we find the data extent is directly shared, we terminate immediately; 2) If the data extent is not directly shared (its extent item has a refcount of 1), then it may be shared if we have snapshots that share subtrees of the inode's subvolume b+tree. So we check if the leaf containing the file extent item is shared, then its parent node, then the parent node of the parent node, etc, until we reach the root node or we find one of them is shared - in which case we stop immediately. During fiemap we process the extents of a file from left to right, from file offset 0 to EOF. This means that we iterate b+tree leaves from left to right, and has the implication that we keep repeating that second step above several times for the same b+tree path of the inode's subvolume b+tree. For example, if we have two file extent items in leaf X, and the path to leaf X is A -> B -> C -> X, then when we try to determine if the data extent referenced by the first extent item is shared, we check if the data extent is shared - if it's not, then we check if leaf X is shared, if not, then we check if node C is shared, if not, then check if node B is shared, if not than check if node A is shared. When we move to the next file extent item, after determining the data extent is not shared, we repeat the checks for X, C, B and A - doing all the expensive searches in the extent tree, delayed refs, etc. If we have thousands of tile extents, then we keep repeating the sharedness checks for the same paths over and over. On a file that has no shared extents or only a small portion, it's easy to see that this scales terribly with the number of extents in the file and the sizes of the extent and subvolume b+trees. This change eliminates the repeated sharedness check on extent buffers by caching the results of the last path used. The results can be used as long as no snapshots were created since they were cached (for not shared extent buffers) or no roots were dropped since they were cached (for shared extent buffers). This greatly reduces the time spent by fiemap for files with thousands of extents and/or large extent and subvolume b+trees. Example performance test: $ cat fiemap-perf-test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount -o compress=lzo $DEV $MNT # 40G gives 327680 128K file extents (due to compression). xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 3597 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 2107 milliseconds (metadata cached) After this patch: $ ./fiemap-perf-test.sh (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1646 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 698 milliseconds (metadata cached) That's about 2.2x faster when no metadata is cached, and about 3x faster when all metadata is cached. On a real filesystem with many other files, data, directories, etc, the b+trees will be 2 or 3 levels higher, therefore this optimization will have a higher impact. Several reports of a slow fiemap show up often, the two Link tags below refer to two recent reports of such slowness. This patch, together with the next ones in the series, is meant to address that. Link: https://lore.kernel.org/linux-btrfs/21dd32c6-f1f9-f44a-466a-e18fdc6788a7@virtuozzo.com/ Link: https://lore.kernel.org/linux-btrfs/Ysace25wh5BbLd5f@atmark-techno.com/ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-01 13:18:28 +00:00
};
btrfs: send: optimize clone detection to increase extent sharing Currently send does not do the best decisions when it comes to decide between multiple clone sources, which results in clone operations for partial extent ranges, which has the following disadvantages: 1) We get less shared extents at the destination; 2) We have to read more data during the send operation and emit more write commands. Besides not being optimal behaviour, it also breaks user expectations and is often reported by users, with a recent example in the Link tag at the bottom of this change log. Part of the reason for this non-optimal behaviour is that the backref walking code does not provide information about the length of the file extent items that were found for each backref, so send is blind about which backref is the best to chose as a cloning source. The other existing reasons are just silliness, namely always prefering the inode with the lowest number when multiple are found for the same root and when we can clone from multiple roots, always prefer the send root over any of the other clone roots. This does not make any sense since any inode or root is fine and as good as any other inode/root. Fix this by making backref walking pass information about the number of bytes referenced by each file extent item and then have send's backref callback pick the inode with the highest number of bytes for each root. Finally select the root from which we can clone more bytes from. Example reproducer: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite -S 0xab -b 2M 0 2M" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar cp --reflink=always $MNT/foo $MNT/baz sync # Overwrite the second half of file foo. xfs_io -c "pwrite -S 0xcd -b 1M 1M 1M" $MNT/foo sync echo echo "*** fiemap in the original filesystem ***" echo xfs_io -c "fiemap -v" $MNT/foo xfs_io -c "fiemap -v" $MNT/bar xfs_io -c "fiemap -v" $MNT/baz echo btrfs filesystem du $MNT btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send -f /tmp/send_stream $MNT/snap umount $MNT mkfs.btrfs -f $DEV &> /dev/null mount $DEV $MNT btrfs receive -f /tmp/send_stream $MNT echo echo "*** fiemap in the new filesystem ***" echo xfs_io -r -c "fiemap -v" $MNT/snap/foo xfs_io -r -c "fiemap -v" $MNT/snap/bar xfs_io -r -c "fiemap -v" $MNT/snap/baz echo btrfs filesystem du $MNT rm -f /tmp/send_stream rm -f /tmp/snap.fssum umount $MNT Before this change: $ ./test.sh (...) *** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 32768..34815 2048 0x1 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 1.00MiB - /mnt/sdi/snap/bar 2.00MiB 1.00MiB - /mnt/sdi/snap/baz 6.00MiB 2.00MiB - /mnt/sdi/snap 6.00MiB 2.00MiB 2.00MiB /mnt/sdi We end up with two 1M extents that are not shared for files bar and baz. After this change: $ ./test.sh (...) *** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 0.00B - /mnt/sdi/snap/bar 2.00MiB 0.00B - /mnt/sdi/snap/baz 6.00MiB 0.00B - /mnt/sdi/snap 6.00MiB 0.00B 3.00MiB /mnt/sdi Now there's a much better sharing, files bar and baz share 1M of the extent of file foo and the second extent of files bar and baz is shared between themselves. This will later be turned into a test case for fstests. Link: https://lore.kernel.org/linux-btrfs/20221008005704.795b44b0@crass-HP-ZBook-15-G2/ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-11-01 16:15:45 +00:00
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *ctx);
struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void);
void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_key *key, struct btrfs_extent_item *ei,
u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
bool search_commit_root,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path, void *ctx,
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and offset (encoded as a single logical address) to a list of extent refs. LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping (extent ref -> extent bytenr and offset, or logical address). These are useful capabilities for programs that manipulate extents and extent references from userspace (e.g. dedup and defrag utilities). When the extents are uncompressed (and not encrypted and not other), check_extent_in_eb performs filtering of the extent refs to remove any extent refs which do not contain the same extent offset as the 'logical' parameter's extent offset. This prevents LOGICAL_INO from returning references to more than a single block. To find the set of extent references to an uncompressed extent from [a, b), userspace has to run a loop like this pseudocode: for (i = a; i < b; ++i) extent_ref_set += LOGICAL_INO(i); At each iteration of the loop (up to 32768 iterations for a 128M extent), data we are interested in is collected in the kernel, then deleted by the filter in check_extent_in_eb. When the extents are compressed (or encrypted or other), the 'logical' parameter must be an extent bytenr (the 'a' parameter in the loop). No filtering by extent offset is done (or possible?) so the result is the complete set of extent refs for the entire extent. This removes the need for the loop, since we get all the extent refs in one call. Add an 'ignore_offset' argument to iterate_inodes_from_logical, [...several levels of function call graph...], and check_extent_in_eb, so that we can disable the extent offset filtering for uncompressed extents. This flag can be set by an improved version of the LOGICAL_INO ioctl to get either behavior as desired. There is no functional change in this patch. The new flag is always false. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: David Sterba <dsterba@suse.com> [ minor coding style fixes ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 17:58:45 +00:00
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx);
int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
btrfs: fix lock inversion problem when doing qgroup extent tracing At btrfs_qgroup_trace_extent_post() we call btrfs_find_all_roots() with a NULL value as the transaction handle argument, which makes that function take the commit_root_sem semaphore, which is necessary when we don't hold a transaction handle or any other mechanism to prevent a transaction commit from wiping out commit roots. However btrfs_qgroup_trace_extent_post() can be called in a context where we are holding a write lock on an extent buffer from a subvolume tree, namely from btrfs_truncate_inode_items(), called either during truncate or unlink operations. In this case we end up with a lock inversion problem because the commit_root_sem is a higher level lock, always supposed to be acquired before locking any extent buffer. Lockdep detects this lock inversion problem since we switched the extent buffer locks from custom locks to semaphores, and when running btrfs/158 from fstests, it reported the following trace: [ 9057.626435] ====================================================== [ 9057.627541] WARNING: possible circular locking dependency detected [ 9057.628334] 5.14.0-rc2-btrfs-next-93 #1 Not tainted [ 9057.628961] ------------------------------------------------------ [ 9057.629867] kworker/u16:4/30781 is trying to acquire lock: [ 9057.630824] ffff8e2590f58760 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.632542] but task is already holding lock: [ 9057.633551] ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] [ 9057.635255] which lock already depends on the new lock. [ 9057.636292] the existing dependency chain (in reverse order) is: [ 9057.637240] -> #1 (&fs_info->commit_root_sem){++++}-{3:3}: [ 9057.638138] down_read+0x46/0x140 [ 9057.638648] btrfs_find_all_roots+0x41/0x80 [btrfs] [ 9057.639398] btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs] [ 9057.640283] btrfs_add_delayed_data_ref+0x418/0x490 [btrfs] [ 9057.641114] btrfs_free_extent+0x35/0xb0 [btrfs] [ 9057.641819] btrfs_truncate_inode_items+0x424/0xf70 [btrfs] [ 9057.642643] btrfs_evict_inode+0x454/0x4f0 [btrfs] [ 9057.643418] evict+0xcf/0x1d0 [ 9057.643895] do_unlinkat+0x1e9/0x300 [ 9057.644525] do_syscall_64+0x3b/0xc0 [ 9057.645110] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 9057.645835] -> #0 (btrfs-tree-00){++++}-{3:3}: [ 9057.646600] __lock_acquire+0x130e/0x2210 [ 9057.647248] lock_acquire+0xd7/0x310 [ 9057.647773] down_read_nested+0x4b/0x140 [ 9057.648350] __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.649175] btrfs_read_lock_root_node+0x31/0x40 [btrfs] [ 9057.650010] btrfs_search_slot+0x537/0xc00 [btrfs] [ 9057.650849] scrub_print_warning_inode+0x89/0x370 [btrfs] [ 9057.651733] iterate_extent_inodes+0x1e3/0x280 [btrfs] [ 9057.652501] scrub_print_warning+0x15d/0x2f0 [btrfs] [ 9057.653264] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] [ 9057.654295] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] [ 9057.655111] btrfs_work_helper+0xf8/0x400 [btrfs] [ 9057.655831] process_one_work+0x247/0x5a0 [ 9057.656425] worker_thread+0x55/0x3c0 [ 9057.656993] kthread+0x155/0x180 [ 9057.657494] ret_from_fork+0x22/0x30 [ 9057.658030] other info that might help us debug this: [ 9057.659064] Possible unsafe locking scenario: [ 9057.659824] CPU0 CPU1 [ 9057.660402] ---- ---- [ 9057.660988] lock(&fs_info->commit_root_sem); [ 9057.661581] lock(btrfs-tree-00); [ 9057.662348] lock(&fs_info->commit_root_sem); [ 9057.663254] lock(btrfs-tree-00); [ 9057.663690] *** DEADLOCK *** [ 9057.664437] 4 locks held by kworker/u16:4/30781: [ 9057.665023] #0: ffff8e25922a1148 ((wq_completion)btrfs-scrub){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 [ 9057.666260] #1: ffffabb3451ffe70 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 [ 9057.667639] #2: ffff8e25922da198 (&ret->mutex){+.+.}-{3:3}, at: scrub_handle_errored_block.isra.0+0x5d2/0x1640 [btrfs] [ 9057.669017] #3: ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] [ 9057.670408] stack backtrace: [ 9057.670976] CPU: 7 PID: 30781 Comm: kworker/u16:4 Not tainted 5.14.0-rc2-btrfs-next-93 #1 [ 9057.672030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 9057.673492] Workqueue: btrfs-scrub btrfs_work_helper [btrfs] [ 9057.674258] Call Trace: [ 9057.674588] dump_stack_lvl+0x57/0x72 [ 9057.675083] check_noncircular+0xf3/0x110 [ 9057.675611] __lock_acquire+0x130e/0x2210 [ 9057.676132] lock_acquire+0xd7/0x310 [ 9057.676605] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.677313] ? lock_is_held_type+0xe8/0x140 [ 9057.677849] down_read_nested+0x4b/0x140 [ 9057.678349] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.679068] __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 9057.679760] btrfs_read_lock_root_node+0x31/0x40 [btrfs] [ 9057.680458] btrfs_search_slot+0x537/0xc00 [btrfs] [ 9057.681083] ? _raw_spin_unlock+0x29/0x40 [ 9057.681594] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] [ 9057.682336] scrub_print_warning_inode+0x89/0x370 [btrfs] [ 9057.683058] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] [ 9057.683834] ? scrub_write_block_to_dev_replace+0xb0/0xb0 [btrfs] [ 9057.684632] iterate_extent_inodes+0x1e3/0x280 [btrfs] [ 9057.685316] scrub_print_warning+0x15d/0x2f0 [btrfs] [ 9057.685977] ? ___ratelimit+0xa4/0x110 [ 9057.686460] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] [ 9057.687316] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] [ 9057.688021] btrfs_work_helper+0xf8/0x400 [btrfs] [ 9057.688649] ? lock_is_held_type+0xe8/0x140 [ 9057.689180] process_one_work+0x247/0x5a0 [ 9057.689696] worker_thread+0x55/0x3c0 [ 9057.690175] ? process_one_work+0x5a0/0x5a0 [ 9057.690731] kthread+0x155/0x180 [ 9057.691158] ? set_kthread_struct+0x40/0x40 [ 9057.691697] ret_from_fork+0x22/0x30 Fix this by making btrfs_find_all_roots() never attempt to lock the commit_root_sem when it is called from btrfs_qgroup_trace_extent_post(). We can't just pass a non-NULL transaction handle to btrfs_find_all_roots() from btrfs_qgroup_trace_extent_post(), because that would make backref lookup not use commit roots and acquire read locks on extent buffers, and therefore could deadlock when btrfs_qgroup_trace_extent_post() is called from the btrfs_truncate_inode_items() code path which has acquired a write lock on an extent buffer of the subvolume btree. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Qu Wenruo <wqu@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-21 16:31:48 +00:00
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
char *dest, u32 size);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
btrfs: skip unnecessary extent buffer sharedness checks during fiemap During fiemap, for each file extent we find, we must check if it's shared or not. The sharedness check starts by verifying if the extent is directly shared (its refcount in the extent tree is > 1), and if it is not directly shared, then we will check if every node in the subvolume b+tree leading from the root to the leaf that has the file extent item (in reverse order), is shared (through snapshots). However this second step is not needed if our extent was created in a transaction more recent than the last transaction where a snapshot of the inode's root happened, because it can't be shared indirectly (through shared subtrees) without a snapshot created in a more recent transaction. So grab the generation of the extent from the extent map and pass it to btrfs_is_data_extent_shared(), which will skip this second phase when the generation is more recent than the root's last snapshot value. Note that we skip this optimization if the extent map is the result of merging 2 or more extent maps, because in this case its generation is the maximum of the generations of all merged extent maps. The fact the we use extent maps and they can be merged despite the underlying extents being distinct (different file extent items in the subvolume b+tree and different extent items in the extent b+tree), can result in some bugs when reporting shared extents. But this is a problem of the current implementation of fiemap relying on extent maps. One example where we get incorrect results is: $ cat fiemap-bug.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT # Create a file with two 256K extents. # Since there is no other write activity, they will be contiguous, # and their extent maps merged, despite having two distinct extents. xfs_io -f -c "pwrite -S 0xab 0 256K" \ -c "fsync" \ -c "pwrite -S 0xcd 256K 256K" \ -c "fsync" \ $MNT/foo # Now clone only the second extent into another file. xfs_io -f -c "reflink $MNT/foo 256K 0 256K" $MNT/bar # Filefrag will report a single 512K extent, and say it's not shared. echo filefrag -v $MNT/foo umount $MNT Running the reproducer: $ ./fiemap-bug.sh wrote 262144/262144 bytes at offset 0 256 KiB, 64 ops; 0.0038 sec (65.479 MiB/sec and 16762.7030 ops/sec) wrote 262144/262144 bytes at offset 262144 256 KiB, 64 ops; 0.0040 sec (61.125 MiB/sec and 15647.9218 ops/sec) linked 262144/262144 bytes at offset 0 256 KiB, 1 ops; 0.0002 sec (1.034 GiB/sec and 4237.2881 ops/sec) Filesystem type is: 9123683e File size of /mnt/sdj/foo is 524288 (128 blocks of 4096 bytes) ext: logical_offset: physical_offset: length: expected: flags: 0: 0.. 127: 3328.. 3455: 128: last,eof /mnt/sdj/foo: 1 extent found We end up reporting that we have a single 512K that is not shared, however we have two 256K extents, and the second one is shared. Changing the reproducer to clone instead the first extent into file 'bar', makes us report a single 512K extent that is shared, which is algo incorrect since we have two 256K extents and only the first one is shared. This is z problem that existed before this change, and remains after this change, as it can't be easily fixed. The next patch in the series reworks fiemap to primarily use file extent items instead of extent maps (except for checking for delalloc ranges), with the goal of improving its scalability and performance, but it also ends up fixing this particular bug caused by extent map merging. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: Qu Wenruo <wqu@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-01 13:18:29 +00:00
u64 extent_gen,
struct btrfs_backref_share_check_ctx *ctx);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
int level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
u64 wanted_disk_byte;
};
/*
* Iterate backrefs of one extent.
*
* Now it only supports iteration of tree block in commit root.
*/
struct btrfs_backref_iter {
u64 bytenr;
struct btrfs_path *path;
struct btrfs_fs_info *fs_info;
struct btrfs_key cur_key;
u32 item_ptr;
u32 cur_ptr;
u32 end_ptr;
};
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
{
if (!iter)
return;
btrfs_free_path(iter->path);
kfree(iter);
}
static inline struct extent_buffer *btrfs_backref_get_eb(
struct btrfs_backref_iter *iter)
{
if (!iter)
return NULL;
return iter->path->nodes[0];
}
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
*
* This helper determines if that's the case.
*/
static inline bool btrfs_backref_has_tree_block_info(
struct btrfs_backref_iter *iter)
{
if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY &&
iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item))
return true;
return false;
}
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
static inline bool btrfs_backref_iter_is_inline_ref(
struct btrfs_backref_iter *iter)
{
if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
return true;
return false;
}
static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
{
iter->bytenr = 0;
iter->item_ptr = 0;
iter->cur_ptr = 0;
iter->end_ptr = 0;
btrfs_release_path(iter->path);
memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
/*
* Backref cache related structures
*
* The whole objective of backref_cache is to build a bi-directional map
* of tree blocks (represented by backref_node) and all their parents.
*/
/*
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
struct {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
u64 new_bytenr;
/* Objectid of tree block owner, can be not uptodate */
u64 owner;
/* Link to pending, changed or detached list */
struct list_head list;
/* List of upper level edges, which link this node to its parents */
struct list_head upper;
/* List of lower level edges, which link this node to its children */
struct list_head lower;
/* NULL if this node is not tree root */
struct btrfs_root *root;
/* Extent buffer got by COWing the block */
struct extent_buffer *eb;
/* Level of the tree block */
unsigned int level:8;
/* Is the block in a non-shareable tree */
unsigned int cowonly:1;
/* 1 if no child node is in the cache */
unsigned int lowest:1;
/* Is the extent buffer locked */
unsigned int locked:1;
/* Has the block been processed */
unsigned int processed:1;
/* Have backrefs of this block been checked */
unsigned int checked:1;
/*
* 1 if corresponding block has been COWed but some upper level block
* pointers may not point to the new location
*/
unsigned int pending:1;
/* 1 if the backref node isn't connected to any other backref node */
unsigned int detached:1;
/*
* For generic purpose backref cache, where we only care if it's a reloc
* root, doesn't care the source subvolid.
*/
unsigned int is_reloc_root:1;
};
#define LOWER 0
#define UPPER 1
/*
* Represent an edge connecting upper and lower backref nodes.
*/
struct btrfs_backref_edge {
/*
* list[LOWER] is linked to btrfs_backref_node::upper of lower level
* node, and list[UPPER] is linked to btrfs_backref_node::lower of
* upper level node.
*
* Also, build_backref_tree() uses list[UPPER] for pending edges, before
* linking list[UPPER] to its upper level nodes.
*/
struct list_head list[2];
/* Two related nodes */
struct btrfs_backref_node *node[2];
};
struct btrfs_backref_cache {
/* Red black tree of all backref nodes in the cache */
struct rb_root rb_root;
/* For passing backref nodes to btrfs_reloc_cow_block */
struct btrfs_backref_node *path[BTRFS_MAX_LEVEL];
/*
* List of blocks that have been COWed but some block pointers in upper
* level blocks may not reflect the new location
*/
struct list_head pending[BTRFS_MAX_LEVEL];
/* List of backref nodes with no child node */
struct list_head leaves;
/* List of blocks that have been COWed in current transaction */
struct list_head changed;
/* List of detached backref node. */
struct list_head detached;
u64 last_trans;
int nr_nodes;
int nr_edges;
/* List of unchecked backref edges during backref cache build */
struct list_head pending_edge;
/* List of useless backref nodes during backref cache build */
struct list_head useless_node;
struct btrfs_fs_info *fs_info;
/*
* Whether this cache is for relocation
*
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
unsigned int is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
struct btrfs_backref_cache *cache, int is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
#define LINK_LOWER (1 << 0)
#define LINK_UPPER (1 << 1)
static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
struct btrfs_backref_node *lower,
struct btrfs_backref_node *upper,
int link_which)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
if (link_which & LINK_LOWER)
list_add_tail(&edge->list[LOWER], &lower->upper);
if (link_which & LINK_UPPER)
list_add_tail(&edge->list[UPPER], &upper->lower);
}
static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
if (node) {
btrfs: add asserts for deleting backref cache nodes A weird KASAN problem that Zygo reported could have been easily caught if we checked for basic things in our backref freeing code. We have two methods of freeing a backref node - btrfs_backref_free_node: this just is kfree() essentially. - btrfs_backref_drop_node: this actually unlinks the node and cleans up everything and then calls btrfs_backref_free_node(). We should mostly be using btrfs_backref_drop_node(), to make sure the node is properly unlinked from the backref cache, and only use btrfs_backref_free_node() when we know the node isn't actually linked to the backref cache. We made a mistake here and thus got the KASAN splat. Make this style of issue easier to find by adding some ASSERT()'s to btrfs_backref_free_node() and adjusting our deletion stuff to properly init the list so we can rely on list_empty() checks working properly. BUG: KASAN: use-after-free in btrfs_backref_cleanup_node+0x18a/0x420 Read of size 8 at addr ffff888112402950 by task btrfs/28836 CPU: 0 PID: 28836 Comm: btrfs Tainted: G W 5.10.0-e35f27394290-for-next+ #23 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xbc/0xf9 ? btrfs_backref_cleanup_node+0x18a/0x420 print_address_description.constprop.8+0x21/0x210 ? record_print_text.cold.34+0x11/0x11 ? btrfs_backref_cleanup_node+0x18a/0x420 ? btrfs_backref_cleanup_node+0x18a/0x420 kasan_report.cold.10+0x20/0x37 ? btrfs_backref_cleanup_node+0x18a/0x420 __asan_load8+0x69/0x90 btrfs_backref_cleanup_node+0x18a/0x420 btrfs_backref_release_cache+0x83/0x1b0 relocate_block_group+0x394/0x780 ? merge_reloc_roots+0x4a0/0x4a0 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 ? check_flags.part.50+0x6c/0x1e0 ? btrfs_relocate_chunk+0x120/0x120 ? kmem_cache_alloc_trace+0xa06/0xcb0 ? _copy_from_user+0x83/0xc0 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4c4bdfe427 RSP: 002b:00007fff33ee6df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fff33ee6e98 RCX: 00007f4c4bdfe427 RDX: 00007fff33ee6e98 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 R10: fffffffffffff59d R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fff33ee8a34 R15: 0000000000000001 Allocated by task 28836: kasan_save_stack+0x21/0x50 __kasan_kmalloc.constprop.18+0xbe/0xd0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x410/0xcb0 btrfs_backref_alloc_node+0x46/0xf0 btrfs_backref_add_tree_node+0x60d/0x11d0 build_backref_tree+0xc5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 28836: kasan_save_stack+0x21/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x1f/0x30 __kasan_slab_free+0xf3/0x140 kasan_slab_free+0xe/0x10 kfree+0xde/0x200 btrfs_backref_error_cleanup+0x452/0x530 build_backref_tree+0x1a5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888112402900 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 80 bytes inside of 128-byte region [ffff888112402900, ffff888112402980) The buggy address belongs to the page: page:0000000028b1cd08 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888131c810c0 pfn:0x112402 flags: 0x17ffe0000000200(slab) raw: 017ffe0000000200 ffffea000424f308 ffffea0007d572c8 ffff888100040440 raw: ffff888131c810c0 ffff888112402000 0000000100000009 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888112402800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888112402880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888112402900: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888112402980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888112402a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Link: https://lore.kernel.org/linux-btrfs/20201208194607.GI31381@hungrycats.org/ CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-14 19:02:45 +00:00
ASSERT(list_empty(&node->list));
ASSERT(list_empty(&node->lower));
ASSERT(node->eb == NULL);
cache->nr_nodes--;
btrfs_put_root(node->root);
kfree(node);
}
}
static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
struct btrfs_backref_edge *edge)
{
if (edge) {
cache->nr_edges--;
kfree(edge);
}
}
static inline void btrfs_backref_unlock_node_buffer(
struct btrfs_backref_node *node)
{
if (node->locked) {
btrfs_tree_unlock(node->eb);
node->locked = 0;
}
}
static inline void btrfs_backref_drop_node_buffer(
struct btrfs_backref_node *node)
{
if (node->eb) {
btrfs_backref_unlock_node_buffer(node);
free_extent_buffer(node->eb);
node->eb = NULL;
}
}
/*
* Drop the backref node from cache without cleaning up its children
* edges.
*
* This can only be called on node without parent edges.
* The children edges are still kept as is.
*/
static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
struct btrfs_backref_node *node)
{
btrfs: add asserts for deleting backref cache nodes A weird KASAN problem that Zygo reported could have been easily caught if we checked for basic things in our backref freeing code. We have two methods of freeing a backref node - btrfs_backref_free_node: this just is kfree() essentially. - btrfs_backref_drop_node: this actually unlinks the node and cleans up everything and then calls btrfs_backref_free_node(). We should mostly be using btrfs_backref_drop_node(), to make sure the node is properly unlinked from the backref cache, and only use btrfs_backref_free_node() when we know the node isn't actually linked to the backref cache. We made a mistake here and thus got the KASAN splat. Make this style of issue easier to find by adding some ASSERT()'s to btrfs_backref_free_node() and adjusting our deletion stuff to properly init the list so we can rely on list_empty() checks working properly. BUG: KASAN: use-after-free in btrfs_backref_cleanup_node+0x18a/0x420 Read of size 8 at addr ffff888112402950 by task btrfs/28836 CPU: 0 PID: 28836 Comm: btrfs Tainted: G W 5.10.0-e35f27394290-for-next+ #23 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xbc/0xf9 ? btrfs_backref_cleanup_node+0x18a/0x420 print_address_description.constprop.8+0x21/0x210 ? record_print_text.cold.34+0x11/0x11 ? btrfs_backref_cleanup_node+0x18a/0x420 ? btrfs_backref_cleanup_node+0x18a/0x420 kasan_report.cold.10+0x20/0x37 ? btrfs_backref_cleanup_node+0x18a/0x420 __asan_load8+0x69/0x90 btrfs_backref_cleanup_node+0x18a/0x420 btrfs_backref_release_cache+0x83/0x1b0 relocate_block_group+0x394/0x780 ? merge_reloc_roots+0x4a0/0x4a0 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 ? check_flags.part.50+0x6c/0x1e0 ? btrfs_relocate_chunk+0x120/0x120 ? kmem_cache_alloc_trace+0xa06/0xcb0 ? _copy_from_user+0x83/0xc0 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4c4bdfe427 RSP: 002b:00007fff33ee6df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fff33ee6e98 RCX: 00007f4c4bdfe427 RDX: 00007fff33ee6e98 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 R10: fffffffffffff59d R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fff33ee8a34 R15: 0000000000000001 Allocated by task 28836: kasan_save_stack+0x21/0x50 __kasan_kmalloc.constprop.18+0xbe/0xd0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x410/0xcb0 btrfs_backref_alloc_node+0x46/0xf0 btrfs_backref_add_tree_node+0x60d/0x11d0 build_backref_tree+0xc5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 28836: kasan_save_stack+0x21/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x1f/0x30 __kasan_slab_free+0xf3/0x140 kasan_slab_free+0xe/0x10 kfree+0xde/0x200 btrfs_backref_error_cleanup+0x452/0x530 build_backref_tree+0x1a5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888112402900 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 80 bytes inside of 128-byte region [ffff888112402900, ffff888112402980) The buggy address belongs to the page: page:0000000028b1cd08 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888131c810c0 pfn:0x112402 flags: 0x17ffe0000000200(slab) raw: 017ffe0000000200 ffffea000424f308 ffffea0007d572c8 ffff888100040440 raw: ffff888131c810c0 ffff888112402000 0000000100000009 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888112402800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888112402880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888112402900: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888112402980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888112402a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Link: https://lore.kernel.org/linux-btrfs/20201208194607.GI31381@hungrycats.org/ CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-14 19:02:45 +00:00
ASSERT(list_empty(&node->upper));
btrfs_backref_drop_node_buffer(node);
btrfs: add asserts for deleting backref cache nodes A weird KASAN problem that Zygo reported could have been easily caught if we checked for basic things in our backref freeing code. We have two methods of freeing a backref node - btrfs_backref_free_node: this just is kfree() essentially. - btrfs_backref_drop_node: this actually unlinks the node and cleans up everything and then calls btrfs_backref_free_node(). We should mostly be using btrfs_backref_drop_node(), to make sure the node is properly unlinked from the backref cache, and only use btrfs_backref_free_node() when we know the node isn't actually linked to the backref cache. We made a mistake here and thus got the KASAN splat. Make this style of issue easier to find by adding some ASSERT()'s to btrfs_backref_free_node() and adjusting our deletion stuff to properly init the list so we can rely on list_empty() checks working properly. BUG: KASAN: use-after-free in btrfs_backref_cleanup_node+0x18a/0x420 Read of size 8 at addr ffff888112402950 by task btrfs/28836 CPU: 0 PID: 28836 Comm: btrfs Tainted: G W 5.10.0-e35f27394290-for-next+ #23 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xbc/0xf9 ? btrfs_backref_cleanup_node+0x18a/0x420 print_address_description.constprop.8+0x21/0x210 ? record_print_text.cold.34+0x11/0x11 ? btrfs_backref_cleanup_node+0x18a/0x420 ? btrfs_backref_cleanup_node+0x18a/0x420 kasan_report.cold.10+0x20/0x37 ? btrfs_backref_cleanup_node+0x18a/0x420 __asan_load8+0x69/0x90 btrfs_backref_cleanup_node+0x18a/0x420 btrfs_backref_release_cache+0x83/0x1b0 relocate_block_group+0x394/0x780 ? merge_reloc_roots+0x4a0/0x4a0 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 ? check_flags.part.50+0x6c/0x1e0 ? btrfs_relocate_chunk+0x120/0x120 ? kmem_cache_alloc_trace+0xa06/0xcb0 ? _copy_from_user+0x83/0xc0 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4c4bdfe427 RSP: 002b:00007fff33ee6df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fff33ee6e98 RCX: 00007f4c4bdfe427 RDX: 00007fff33ee6e98 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 R10: fffffffffffff59d R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fff33ee8a34 R15: 0000000000000001 Allocated by task 28836: kasan_save_stack+0x21/0x50 __kasan_kmalloc.constprop.18+0xbe/0xd0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x410/0xcb0 btrfs_backref_alloc_node+0x46/0xf0 btrfs_backref_add_tree_node+0x60d/0x11d0 build_backref_tree+0xc5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 28836: kasan_save_stack+0x21/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x1f/0x30 __kasan_slab_free+0xf3/0x140 kasan_slab_free+0xe/0x10 kfree+0xde/0x200 btrfs_backref_error_cleanup+0x452/0x530 build_backref_tree+0x1a5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888112402900 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 80 bytes inside of 128-byte region [ffff888112402900, ffff888112402980) The buggy address belongs to the page: page:0000000028b1cd08 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888131c810c0 pfn:0x112402 flags: 0x17ffe0000000200(slab) raw: 017ffe0000000200 ffffea000424f308 ffffea0007d572c8 ffff888100040440 raw: ffff888131c810c0 ffff888112402000 0000000100000009 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888112402800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888112402880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888112402900: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888112402980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888112402a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Link: https://lore.kernel.org/linux-btrfs/20201208194607.GI31381@hungrycats.org/ CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-14 19:02:45 +00:00
list_del_init(&node->list);
list_del_init(&node->lower);
if (!RB_EMPTY_NODE(&node->rb_node))
rb_erase(&node->rb_node, &tree->rb_root);
btrfs_backref_free_node(tree, node);
}
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
u64 bytenr, int errno)
{
btrfs_panic(fs_info, errno,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
struct btrfs_path *path,
struct btrfs_backref_iter *iter,
struct btrfs_key *node_key,
struct btrfs_backref_node *cur);
int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *start);
void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
#endif