From 30b3688e1f6c8cca8e0c6c74f88368497f081c9f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 23 Jan 2020 15:44:50 +0800
Subject: [PATCH 001/210] btrfs: Add overview of device replace

Add an overview of btrfs dev-replace. It mentions some corner cases
caused by the write duplication and the scrub-based data copy.

Reviewed-by: Anand Jain
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
[ adjust wording ]
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2ca2a09d0e23..2aad07cdaea8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -22,6 +22,46 @@
 #include "dev-replace.h"
 #include "sysfs.h"
 
+/*
+ * Device replace overview
+ *
+ * [Objective]
+ * To copy all extents (both new and on-disk) from the source device to the
+ * target device, while still keeping the filesystem read-write.
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ *   All new writes will be written to both target and source devices, so even
+ *   if the replace gets canceled, the source device still contains up-to-date
+ *   data.
+ *
+ *   Location:  handle_ops_on_dev_replace() from __btrfs_map_block()
+ *   Start:     btrfs_dev_replace_start()
+ *   End:       btrfs_dev_replace_finishing()
+ *   Content:   Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ *   This happens by re-using the scrub facility, as scrub also iterates
+ *   through existing extents from the commit root.
+ *
+ *   Location:  scrub_write_block_to_dev_replace() from
+ *              scrub_block_complete()
+ *   Content:   Data/meta from the commit root.
+ *
+ * Due to the content difference, we need to avoid NOCOW writes while
+ * dev-replace is happening. This is done by marking the block group read-only
+ * and waiting for NOCOW writes.
+ *
+ * After the replace is done, the finishing part is done by swapping the target
+ * and source devices.
+ *
+ *   Location:  btrfs_dev_replace_update_device_in_mapping_tree() from
+ *              btrfs_dev_replace_finishing()
+ */
+
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                        int scrub_ret);
 static void btrfs_dev_replace_update_device_in_mapping_tree(

From 42836cf4ba9c9b1797d1f7fe3245d82cf6dea6c4 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 22 Jan 2020 12:23:54 +0000
Subject: [PATCH 002/210] Btrfs: don't iterate mod seq list when putting a tree
 mod seq

Each new element added to the mod seq list is always appended to the
list, and each one gets a sequence number coming from a counter which
gets incremented every time a new element is added to the list (or a
new node is added to the tree mod log rbtree). Therefore the element
with the lowest sequence number is always the first element in the
list. So just remove the list iteration at btrfs_put_tree_mod_seq()
that computes the minimum sequence number in the list and replace it
with a check of the first element's sequence number.
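To illustrate the invariant this relies on, here is a minimal userspace
C sketch (the names are illustrative, not the kernel's): because
elements are only ever appended, each with a sequence number taken from
a monotonically increasing counter, the head of the list always carries
the minimum, so scanning the whole list is unnecessary.

  #include <assert.h>
  #include <stddef.h>

  struct seq_elem {
          unsigned long long seq;
          struct seq_elem *next;
  };

  struct seq_list {
          struct seq_elem *head;
          struct seq_elem *tail;
          unsigned long long next_seq;    /* only ever incremented */
  };

  /* Append-only: every new element gets a larger seq than all others. */
  static void seq_list_append(struct seq_list *l, struct seq_elem *e)
  {
          e->seq = l->next_seq++;
          e->next = NULL;
          if (l->tail)
                  l->tail->next = e;
          else
                  l->head = e;
          l->tail = e;
  }

  /* O(1): the head always holds the minimum sequence number. */
  static unsigned long long seq_list_min(const struct seq_list *l)
  {
          assert(l->head != NULL);
          return l->head->seq;
  }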
Reviewed-by: Josef Bacik
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f2ec1a9bae28..968faaec0e39 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -341,7 +341,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	struct rb_root *tm_root;
 	struct rb_node *node;
 	struct rb_node *next;
-	struct seq_list *cur_elem;
 	struct tree_mod_elem *tm;
 	u64 min_seq = (u64)-1;
 	u64 seq_putting = elem->seq;
@@ -353,18 +352,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	list_del(&elem->list);
 	elem->seq = 0;
 
-	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
-		if (cur_elem->seq < min_seq) {
-			if (seq_putting > cur_elem->seq) {
-				/*
-				 * blocker with lower sequence number exists, we
-				 * cannot remove anything from the log
-				 */
-				write_unlock(&fs_info->tree_mod_log_lock);
-				return;
-			}
-			min_seq = cur_elem->seq;
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *first;
+
+		first = list_first_entry(&fs_info->tree_mod_seq_list,
+					 struct seq_list, list);
+		if (seq_putting > first->seq) {
+			/*
+			 * Blocker with lower sequence number exists, we
+			 * cannot remove anything from the log.
+			 */
+			write_unlock(&fs_info->tree_mod_log_lock);
+			return;
 		}
+		min_seq = first->seq;
 	}
 
 	/*

From 0c891389705698821ab59147bbcfffba22372a91 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Jan 2020 13:04:07 +0800
Subject: [PATCH 003/210] btrfs: relocation: Add introduction of how relocation
 works

Relocation is one of the most complex parts of btrfs, and it is also
the foundation for online resizing and profile converting.

For such a complex facility, we should at least have some introduction
to it.

This patch adds a basic introduction at a fairly high level,
explaining:

- What relocation does

- How relocation is done
  Only mentioning how the data reloc tree and reloc trees are involved
  in the operation.
  No details like the backref cache, or the data reloc tree contents.

- Which functions to refer to

More detailed comments will be added for reloc tree creation, data
reloc tree creation and the backref cache.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 47 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 995d4b8b1cfd..35873254d901 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -23,6 +23,53 @@
 #include "delalloc-space.h"
 #include "block-group.h"
 
+/*
+ * Relocation overview
+ *
+ * [What does relocation do]
+ *
+ * The objective of relocation is to relocate all extents of the target block
+ * group to other block groups.
+ * This is utilized by resize (shrink only), profile converting, compacting
+ * space, or the balance routine to spread chunks over devices.
+ *
+ *		Before		|		After
+ * ------------------------------------------------------------------
+ *  BG A: 10 data extents	| BG A: deleted
+ *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
+ *  BG C:  1 extent		| BG C:  3 data extents (1 old + 2 relocated)
+ *
+ * [How does relocation work]
+ *
+ * 1.   Mark the target block group read-only
+ *      New extents won't be allocated from the target block group.
+ *
+ * 2.1  Record each extent in the target block group
+ *      To build a proper map of extents to be relocated.
+ *
+ * 2.2  Build data reloc tree and reloc trees
+ *      The data reloc tree will contain an inode, recording all newly
+ *      relocated data extents.
+ *      There will be only one data reloc tree for one data block group.
+ *
+ *      A reloc tree will be a special snapshot of its source tree, containing
+ *      relocated tree blocks.
+ *      Each tree referring to a tree block in the target block group will get
+ *      its reloc tree built.
+ *
+ * 2.3  Swap source tree with its corresponding reloc tree
+ *      Each involved tree only refers to new extents after the swap.
+ *
+ * 3.   Clean up reloc trees and the data reloc tree.
+ *      As old extents in the target block group are still referenced by reloc
+ *      trees, we need to clean them up before really freeing the target block
+ *      group.
+ *
+ * The main complexity is in steps 2.2 and 2.3.
+ *
+ * The entry point of relocation is the relocate_block_group() function.
+ */
+
 /*
  * backref_node, mapping_node and tree_block start with this
  */

From cfe953c8240d1b21a81b8bd7f33cfe0b96b89f38 Mon Sep 17 00:00:00 2001
From: Su Yue
Date: Tue, 4 Feb 2020 12:51:56 +0800
Subject: [PATCH 004/210] btrfs: update the comment of btrfs_control_ioctl()

Btrfsctl was removed in 2012; now the function btrfs_control_ioctl()
is only used for device ioctls, so update the comment.

Reviewed-by: Nikolay Borisov
Signed-off-by: Su Yue
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67c63858812a..1cbc3f125d5a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2203,7 +2203,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file)
 }
 
 /*
- * used by btrfsctl to scan devices when no FS is mounted
+ * Used by /dev/btrfs-control for device ioctls.
  */
 static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
				unsigned long arg)

From 790a1d44f93f465b37d9d4ff22eea1850a079309 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 17 Jan 2020 09:02:19 -0500
Subject: [PATCH 005/210] btrfs: use btrfs_ordered_update_i_size in
 clone_finish_inode_update

We were using btrfs_i_size_write(), which unconditionally jacks up
inode->disk_i_size. However, since clone can operate on ranges, we
could have pending ordered extents for a range prior to the start of
our clone operation, and thus increase disk_i_size too far and leave a
hole with no file extent. Fix this by using the
btrfs_ordered_update_i_size helper, which will do the right thing in
the face of pending ordered extents outside of our clone range.
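A hedged userspace C sketch of the difference (illustrative names, not
the kernel's): the old path raised disk_i_size unconditionally, while a
safe update must never raise it past the first byte still covered by a
pending ordered extent.

  #include <limits.h>

  /* first_pending: file offset of the earliest ordered extent that has
   * not completed yet, or ULLONG_MAX if nothing is pending (an assumed
   * simplification of the real ordered extent tree). */
  static unsigned long long
  safe_new_disk_i_size(unsigned long long endoff,
                       unsigned long long first_pending)
  {
          /* Never expose a byte that is not yet safely on disk; the
           * buggy path used endoff unconditionally. */
          return endoff < first_pending ? endoff : first_pending;
  }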
Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4f4b13830b25..1bff1b6cd628 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3321,8 +3321,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 	 */
 	if (endoff > destoff + olen)
 		endoff = destoff + olen;
-	if (endoff > inode->i_size)
-		btrfs_i_size_write(BTRFS_I(inode), endoff);
+	if (endoff > inode->i_size) {
+		i_size_write(inode, endoff);
+		btrfs_ordered_update_i_size(inode, endoff, NULL);
+	}
 
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret) {

From 41a2ee75aab0290a5899677437736ec715dcd1b6 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 17 Jan 2020 09:02:21 -0500
Subject: [PATCH 006/210] btrfs: introduce per-inode file extent tree

In order to keep track of where we have file extents on disk, and thus
where it is safe to adjust the i_size to, we need to have a tree in
place to keep track of the contiguous areas we have file extents for.

Add helpers to use this tree, as it's not required for NO_HOLES file
systems. We will use this by setting DIRTY for areas we know we have
file extent items set, and clearing it when we remove file extent items
for truncation.

Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/btrfs_inode.h       |  6 +++
 fs/btrfs/ctree.h             |  5 ++
 fs/btrfs/extent-io-tree.h    |  3 ++
 fs/btrfs/extent_io.c         | 48 +++++++++++++++++++
 fs/btrfs/file-item.c         | 91 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode.c             |  5 ++
 include/trace/events/btrfs.h |  1 +
 7 files changed, 159 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4e12a477d32e..27a1fefce508 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -60,6 +60,12 @@ struct btrfs_inode {
 	 */
 	struct extent_io_tree io_failure_tree;
 
+	/*
+	 * Keep track of where the inode has extent items mapped in order to
+	 * make sure the i_size adjustments are accurate
+	 */
+	struct extent_io_tree file_extent_tree;
+
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36df977b64d9..36dbf4e7c0dc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2859,6 +2859,11 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
				     struct btrfs_file_extent_item *fi,
				     const bool new_inline,
				     struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+					u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+				      u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
 
 /* inode.c */
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index a3febe746c79..cc3037f9765e 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -43,6 +43,7 @@ enum {
 	IO_TREE_RELOC_BLOCKS,
 	IO_TREE_TRANS_DIRTY_PAGES,
 	IO_TREE_ROOT_DIRTY_LOG_PAGES,
+	IO_TREE_INODE_FILE_EXTENT,
 	IO_TREE_SELFTEST,
 };
 
@@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  struct extent_state **cached_state);
 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, unsigned bits);
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+			       u64 *start_ret, u64 *end_ret, unsigned bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset);
 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c0f202741e09..cd07a9c30ec3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -257,6 +257,15 @@ void __cold extent_io_exit(void)
 	bioset_exit(&btrfs_bioset);
 }
 
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up
+ * and update the disk_i_size, but lockdep will complain because, for our
+ * io_tree, we hold the tree lock and then get the inode lock when setting
+ * delalloc. These two things are unrelated, so make a class for the
+ * file_extent_tree so we don't get the two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
@@ -268,6 +277,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 	spin_lock_init(&tree->lock);
 	tree->private_data = private_data;
 	tree->owner = owner;
+	if (owner == IO_TREE_INODE_FILE_EXTENT)
+		lockdep_set_class(&tree->lock, &file_extent_tree_class);
 }
 
 void extent_io_tree_release(struct extent_io_tree *tree)
@@ -1567,6 +1578,43 @@ out:
 	return ret;
 }
 
+/**
+ * find_contiguous_extent_bit: find a contiguous area of bits
+ * @tree - io tree to check
+ * @start - offset to start the search from
+ * @start_ret - the first offset we found with the bits set
+ * @end_ret - the final contiguous range of the bits that were set
+ * @bits - bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search for the first bit we find,
+ * and then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+			       u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+	struct extent_state *state;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	state = find_first_extent_bit_state(tree, start, bits);
+	if (state) {
+		*start_ret = state->start;
+		*end_ret = state->end;
+		while ((state = next_state(state)) != NULL) {
+			if (state->start > (*end_ret + 1))
+				break;
+			*end_ret = state->end;
+		}
+		ret = 0;
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
 /**
  * find_first_clear_extent_bit - find the first range that has @bits not set.
  * This range could start before @start.
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c2f365662d55..6c849e8fd5a1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -23,6 +23,97 @@
 #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
				       PAGE_SIZE))
 
+/**
+ * @inode - the inode we want to update the disk_i_size for
+ * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ *
+ * With NO_HOLES set this simply sets the disk_i_size to whatever
+ * i_size_read() returns, as it is perfectly fine with a file that has holes
+ * without hole file extent items.
+ *
+ * However without NO_HOLES we need to only return the area that is contiguous
+ * from the 0 offset of the file. Otherwise we could end up adjusting i_size
+ * up to an extent that has a gap in between.
+ *
+ * Finally, new_i_size should only be set in the case of truncate, where we're
+ * not ready to use i_size_read() as the limiter yet.
+ */
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	u64 start, end, i_size;
+	int ret;
+
+	i_size = new_i_size ?: i_size_read(inode);
+	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->disk_i_size = i_size;
+		return;
+	}
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
+					 &start, &end, EXTENT_DIRTY);
+	if (!ret && start == 0)
+		i_size = min(i_size, end + 1);
+	else
+		i_size = 0;
+	BTRFS_I(inode)->disk_i_size = i_size;
+	spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we've inserted
+ * @len - the logical length of the file extent item
+ *
+ * Call when we are inserting a new file extent where there was none before.
+ * There is no need to call this in the case where we're replacing an existing
+ * file extent; however, if unsure, it's fine to call this multiple times.
+ *
+ * The start and len must match the file extent item, and thus must be
+ * sectorsize aligned.
+ */
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+				      u64 len)
+{
+	if (len == 0)
+		return 0;
+
+	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
+
+	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+		return 0;
+	return set_extent_bits(&inode->file_extent_tree, start,
+			       start + len - 1, EXTENT_DIRTY);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we've inserted
+ * @len - the logical length of the file extent item
+ *
+ * Called when we drop a file extent, for example when we truncate. Doesn't
+ * need to be called for cases where we're replacing a file extent, like when
+ * we've COWed a file extent.
+ *
+ * The start and len must match the file extent item, and thus must be
+ * sectorsize aligned.
+ */
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+					u64 len)
+{
+	if (len == 0)
+		return 0;
+
+	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
+	       len == (u64)-1);
+
+	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+		return 0;
+	return clear_extent_bit(&inode->file_extent_tree, start,
+				start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+}
+
 static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
					u16 csum_size)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d267eb5caa7b..c481450dc76e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3187,6 +3187,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+			round_up(i_size_read(inode), fs_info->sectorsize));
 
 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -8776,6 +8778,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
 	extent_io_tree_init(fs_info, &ei->io_failure_tree,
			    IO_TREE_INODE_IO_FAILURE, inode);
+	extent_io_tree_init(fs_info, &ei->file_extent_tree,
+			    IO_TREE_INODE_FILE_EXTENT, inode);
 	ei->io_tree.track_uptodate = true;
 	ei->io_failure_tree.track_uptodate = true;
 	atomic_set(&ei->sync_writers, 0);
@@ -8842,6 +8846,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	btrfs_qgroup_check_reserved_leak(inode);
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+	btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
 }
 
 int btrfs_drop_inode(struct inode *inode)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 17088a112ed0..16ade35e8170 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -88,6 +88,7 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS);
	{ IO_TREE_RELOC_BLOCKS,		"RELOC_BLOCKS" },		\
	{ IO_TREE_TRANS_DIRTY_PAGES,	"TRANS_DIRTY_PAGES" },		\
	{ IO_TREE_ROOT_DIRTY_LOG_PAGES,	"ROOT_DIRTY_LOG_PAGES" },	\
+	{ IO_TREE_INODE_FILE_EXTENT,	"INODE_FILE_EXTENT" },		\
	{ IO_TREE_SELFTEST,		"SELFTEST" })
 
 #define BTRFS_GROUP_FLAGS	\

From 9ddc959e802bf7555a0be543205ddcba2bae98bf Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 17 Jan 2020 09:02:22 -0500
Subject: [PATCH 007/210] btrfs: use the file extent tree infrastructure

We want to use this everywhere we modify the file extent items
permanently. These include:

  1) Inserting new file extents for writes and prealloc extents.
  2) Truncating inode items.
  3) btrfs_cont_expand().
  4) Insert inline extents.
  5) Insert new extents from log replay.
  6) Insert a new extent for clone, as it could be past i_size.
  7) Hole punching

For hole punching in particular it might seem it's not necessary,
because anybody extending would use btrfs_cont_expand(); however there
is a corner case that can still give us trouble. Start with an empty
file and

fallocate KEEP_SIZE 1M-2M

We now have a 0 length file, a hole file extent from 0-1M, and a
prealloc extent from 1M-2M. Now

punch 1M-1.5M

Because this is past i_size we have

[HOLE EXTENT][ NOTHING ][PREALLOC]
[0        1M][1M   1.5M][1.5M  2M]

with an i_size of 0. Now if we pwrite 0-1.5M we'll increase our i_size
to 1.5M, but our disk_i_size is still 0 until the ordered extent
completes.
However, if we now immediately truncate the file to 2M we'll just call
btrfs_cont_expand(inode, 1.5M, 2M), since our old i_size is 1.5M. If we
commit the transaction here and crash we'll expose the gap.

To fix this we need to clear the file extent mapping for the range that
we punched but didn't insert a corresponding file extent for. This will
mean the truncate will only get a disk_i_size set to 1M if we crash
before the ordered IO for the write finishes.

I've written an xfstest to reproduce the problem and validate this fix.

Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/delayed-inode.c |  3 +++
 fs/btrfs/file.c          | 32 +++++++++++++++++++++++
 fs/btrfs/inode.c         | 55 +++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/tree-log.c      |  5 ++++
 4 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3e15e1d4a91..9b23883a1086 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1760,6 +1760,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 
 int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 {
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct btrfs_delayed_node *delayed_node;
 	struct btrfs_inode_item *inode_item;
 
@@ -1779,6 +1780,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
 	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
 	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+			round_up(i_size_read(inode), fs_info->sectorsize));
 	inode->i_mode = btrfs_stack_inode_mode(inode_item);
 	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
 	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a16da274c9aa..200eb14b6721 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2486,6 +2486,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+			clone_info->file_offset, clone_len);
+	if (ret)
+		return ret;
+
 	/* If it's a hole, nothing more needs to be done. */
 	if (clone_info->disk_offset == 0)
 		return 0;
@@ -2596,6 +2601,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
				btrfs_abort_transaction(trans, ret);
				break;
			}
+		} else if (!clone_info && cur_offset < drop_end) {
+			/*
+			 * We are past the i_size here, but since we didn't
+			 * insert holes we need to clear the mapped area so we
+			 * know to not set disk_i_size in this area until a new
+			 * file extent is inserted here.
+			 */
+			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+					cur_offset, drop_end - cur_offset);
+			if (ret) {
+				/*
+				 * We couldn't clear our area, so we could
+				 * presumably adjust up and corrupt the fs, so
+				 * we need to abort.
+				 */
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
 		}
 
 		if (clone_info && drop_end > clone_info->file_offset) {
@@ -2686,6 +2709,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
			btrfs_abort_transaction(trans, ret);
			goto out_trans;
		}
+	} else if (!clone_info && cur_offset < drop_end) {
+		/* See the comment in the loop above for the reasoning here. */
+		ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+				cur_offset, drop_end - cur_offset);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out_trans;
+		}
+	}
 	if (clone_info) {
 		ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c481450dc76e..2865de52dc22 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -241,6 +241,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
+	/*
+	 * We align size to sectorsize for inline extents just for
+	 * simplicity's sake.
+	 */
+	size = ALIGN(size, root->fs_info->sectorsize);
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+	if (ret)
+		goto fail;
+
 	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
@@ -2446,6 +2455,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
+	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
+						ram_bytes);
+	if (ret)
+		goto out;
+
 	/*
	 * Release the reserved range from inode dirty range map, as it is
	 * already moved into delayed_ref_head
@@ -4160,6 +4174,8 @@ search_again:
 	}
 
 	while (1) {
+		u64 clear_start = 0, clear_len = 0;
+
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4210,6 +4226,8 @@ search_again:
 
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
+
+			clear_start = found_key.offset;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
 				u64 orig_num_bytes =
@@ -4217,6 +4235,7 @@ search_again:
				extent_num_bytes = ALIGN(new_size -
						found_key.offset,
						fs_info->sectorsize);
+				clear_start = ALIGN(new_size, fs_info->sectorsize);
				btrfs_set_file_extent_num_bytes(leaf, fi,
							 extent_num_bytes);
				num_dec = (orig_num_bytes -
@@ -4242,6 +4261,7 @@ search_again:
					inode_sub_bytes(inode, num_dec);
				}
			}
+			clear_len = num_dec;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			/*
			 * we can't truncate inline items that have had
@@ -4263,12 +4283,33 @@ search_again:
				 */
				ret = NEED_TRUNCATE_BLOCK;
				break;
+			} else {
+				/*
+				 * Inline extents are special, we just treat
+				 * them as a full sector worth in the file
+				 * extent tree just for simplicity's sake.
+				 */
+				clear_len = fs_info->sectorsize;
			}
 
			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				inode_sub_bytes(inode, item_end + 1 - new_size);
		}
 delete:
+		/*
+		 * We use btrfs_truncate_inode_items() to clean up log trees
+		 * for multiple fsyncs, and in this case we don't want to clear
+		 * the file extent range because it's just the log.
+ */ + if (root == BTRFS_I(inode)->root) { + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + if (del_item) last_size = found_key.offset; else @@ -4591,14 +4632,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); + hole_size = last_byte - cur_offset; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - hole_size = last_byte - cur_offset; err = maybe_insert_hole(root, inode, cur_offset, hole_size); if (err) break; + + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4630,6 +4678,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); + } else { + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; } next: free_extent_map(em); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7dd7552f53a4..d3b101f2816a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -830,6 +830,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, + extent_end - start); + if (ret) + goto out; + inode_add_bytes(inode, nbytes); update_inode: ret = btrfs_update_inode(trans, root, inode); From d923afe96d7eabbe868a478bae0997dfecb8a5a3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Jan 2020 09:02:23 -0500 Subject: [PATCH 008/210] btrfs: replace all uses of btrfs_ordered_update_i_size Now that we have a safe way to update the i_size, replace all uses of btrfs_ordered_update_i_size with btrfs_inode_safe_disk_i_size_write. 
Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/file.c  |  2 +-
 fs/btrfs/inode.c | 12 ++++++------
 fs/btrfs/ioctl.c |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 200eb14b6721..06a2f0f2e602 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2967,7 +2967,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
 
 	inode->i_ctime = current_time(inode);
 	i_size_write(inode, end);
-	btrfs_ordered_update_i_size(inode, end, NULL);
+	btrfs_inode_safe_disk_i_size_write(inode, 0);
 	ret = btrfs_update_inode(trans, root, inode);
 	ret2 = btrfs_end_transaction(trans);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2865de52dc22..8b801a1e7845 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2550,7 +2550,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
		 */
		btrfs_qgroup_free_data(inode, NULL, start,
				       ordered_extent->num_bytes);
-		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
		if (freespace_inode)
			trans = btrfs_join_transaction_spacecache(root);
		else
@@ -2621,7 +2621,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
		goto out;
	}
 
-	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	btrfs_inode_safe_disk_i_size_write(inode, 0);
	ret = btrfs_update_inode_fallback(trans, root, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, ret);
@@ -4411,7 +4411,7 @@ out:
		ASSERT(last_size >= new_size);
		if (!ret && last_size > new_size)
			last_size = new_size;
-		btrfs_ordered_update_i_size(inode, last_size, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, last_size);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
				     (u64)-1, &cached_state);
	}
@@ -4740,7 +4740,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
		}
 
		i_size_write(inode, newsize);
-		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
		pagecache_isize_extended(inode, oldsize, newsize);
		ret = btrfs_update_inode(trans, root, inode);
		btrfs_end_write_no_snapshotting(root);
@@ -8727,7 +8727,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
			ret = PTR_ERR(trans);
			goto out;
		}
-		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}
 
	if (trans) {
@@ -9996,7 +9996,7 @@ next:
		else
			i_size = cur_offset;
		i_size_write(inode, i_size);
-		btrfs_ordered_update_i_size(inode, i_size, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}
 
	ret = btrfs_update_inode(trans, root, inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1bff1b6cd628..ba1608d0cd8e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3323,7 +3323,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
-		btrfs_ordered_update_i_size(inode, endoff, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}
 
	ret = btrfs_update_inode(trans, root, inode);

From 3f1c64ce04387773d2b0d8ef6a7e573ff80e4436 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 17 Jan 2020 09:02:24 -0500
Subject: [PATCH 009/210] btrfs: delete the ordered isize update code

Now that we have a safe way to update the isize, remove all of this
code as it's no longer needed.
Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ordered-data.c      | 128 -----------------------------------
 fs/btrfs/ordered-data.h      |   7 --
 include/trace/events/btrfs.h |   1 -
 3 files changed, 136 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a65f189a5b94..66170c8cea6f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -785,134 +785,6 @@ out:
 	return entry;
 }
 
-/*
- * After an extent is done, call this to conditionally update the on disk
- * i_size. i_size is updated to cover any fully written part of the file.
- */
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
-				struct btrfs_ordered_extent *ordered)
-{
-	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	u64 disk_i_size;
-	u64 new_i_size;
-	u64 i_size = i_size_read(inode);
-	struct rb_node *node;
-	struct rb_node *prev = NULL;
-	struct btrfs_ordered_extent *test;
-	int ret = 1;
-	u64 orig_offset = offset;
-
-	spin_lock_irq(&tree->lock);
-	if (ordered) {
-		offset = entry_end(ordered);
-		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
-			offset = min(offset,
-				     ordered->file_offset +
-				     ordered->truncated_len);
-	} else {
-		offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
-	}
-	disk_i_size = BTRFS_I(inode)->disk_i_size;
-
-	/*
-	 * truncate file.
-	 * If ordered is not NULL, then this is called from endio and
-	 * disk_i_size will be updated by either truncate itself or any
-	 * in-flight IOs which are inside the disk_i_size.
-	 *
-	 * Because btrfs_setsize() may set i_size with disk_i_size if truncate
-	 * fails somehow, we need to make sure we have a precise disk_i_size by
-	 * updating it as usual.
-	 *
-	 */
-	if (!ordered && disk_i_size > i_size) {
-		BTRFS_I(inode)->disk_i_size = orig_offset;
-		ret = 0;
-		goto out;
-	}
-
-	/*
-	 * if the disk i_size is already at the inode->i_size, or
-	 * this ordered extent is inside the disk i_size, we're done
-	 */
-	if (disk_i_size == i_size)
-		goto out;
-
-	/*
-	 * We still need to update disk_i_size if outstanding_isize is greater
-	 * than disk_i_size.
-	 */
-	if (offset <= disk_i_size &&
-	    (!ordered || ordered->outstanding_isize <= disk_i_size))
-		goto out;
-
-	/*
-	 * walk backward from this ordered extent to disk_i_size.
-	 * if we find an ordered extent then we can't update disk i_size
-	 * yet
-	 */
-	if (ordered) {
-		node = rb_prev(&ordered->rb_node);
-	} else {
-		prev = tree_search(tree, offset);
-		/*
-		 * we insert file extents without involving ordered struct,
-		 * so there should be no ordered struct cover this offset
-		 */
-		if (prev) {
-			test = rb_entry(prev, struct btrfs_ordered_extent,
-					rb_node);
-			BUG_ON(offset_in_entry(test, offset));
-		}
-		node = prev;
-	}
-	for (; node; node = rb_prev(node)) {
-		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
-		/* We treat this entry as if it doesn't exist */
-		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
-			continue;
-
-		if (entry_end(test) <= disk_i_size)
-			break;
-		if (test->file_offset >= i_size)
-			break;
-
-		/*
-		 * We don't update disk_i_size now, so record this undealt
-		 * i_size. Or we will not know the real i_size.
-		 */
-		if (test->outstanding_isize < offset)
-			test->outstanding_isize = offset;
-		if (ordered &&
-		    ordered->outstanding_isize > test->outstanding_isize)
-			test->outstanding_isize = ordered->outstanding_isize;
-		goto out;
-	}
-	new_i_size = min_t(u64, offset, i_size);
-
-	/*
-	 * Some ordered extents may completed before the current one, and
-	 * we hold the real i_size in ->outstanding_isize.
-	 */
-	if (ordered && ordered->outstanding_isize > new_i_size)
-		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
-	BTRFS_I(inode)->disk_i_size = new_i_size;
-	ret = 0;
-out:
-	/*
-	 * We need to do this because we can't remove ordered extents until
-	 * after the i_disk_size has been updated and then the inode has been
-	 * updated to reflect the change, so we need to tell anybody who finds
-	 * this ordered extent that we've already done all the real work, we
-	 * just haven't completed all the other work.
-	 */
-	if (ordered)
-		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
-	spin_unlock_irq(&tree->lock);
-	return ret;
-}
-
 /*
  * search the ordered extents for one corresponding to 'offset' and
  * try to find a checksum. This is used because we allow pages to
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3beb4da4ab41..a46f319d9ae0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -52,11 +52,6 @@ enum {
 	BTRFS_ORDERED_DIRECT,
 	/* We had an io error when writing this out */
 	BTRFS_ORDERED_IOERR,
-	/*
-	 * indicates whether this ordered extent has done its due diligence in
-	 * updating the isize
-	 */
-	BTRFS_ORDERED_UPDATED_ISIZE,
 	/* Set when we have to truncate an extent */
 	BTRFS_ORDERED_TRUNCATED,
 	/* Regular IO for COW */
@@ -182,8 +177,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode,
		u64 file_offset,
		u64 len);
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
-				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u8 *sum, int len);
 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 16ade35e8170..f1f2b6a04052 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -469,7 +469,6 @@ DEFINE_EVENT(
	{ (1 << BTRFS_ORDERED_PREALLOC),	"PREALLOC"	}, \
	{ (1 << BTRFS_ORDERED_DIRECT),		"DIRECT"	}, \
	{ (1 << BTRFS_ORDERED_IOERR),		"IOERR"		}, \
-	{ (1 << BTRFS_ORDERED_UPDATED_ISIZE),	"UPDATED_ISIZE"	}, \
	{ (1 << BTRFS_ORDERED_TRUNCATED),	"TRUNCATED"	})

From 96dfcb46ffca4803627f7ae74085fbb2bae2acc5 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:18 -0500
Subject: [PATCH 010/210] btrfs: push __setup_root into btrfs_alloc_root

There's no reason not to init the root at alloc time, and with later
patches it actually causes problems if we error out mounting the fs
before the tree_root is init'ed, because we expect it to have a valid
ref count. Fix this by pushing __setup_root into btrfs_alloc_root.
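The pattern, in a small userspace C sketch (illustrative types, not the
kernel's): folding initialization into the allocator guarantees that no
caller, including error paths, can ever observe a half-constructed
object.

  #include <stdlib.h>

  struct root {
          unsigned long long objectid;
          int refs;
  };

  /* Allocation and setup in one step; the separate setup call that
   * every caller previously had to remember is gone. */
  static struct root *root_alloc(unsigned long long objectid)
  {
          struct root *root = calloc(1, sizeof(*root));

          if (root) {
                  root->objectid = objectid;
                  root->refs = 1;   /* valid ref count from birth */
          }
          return root;
  }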
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6c9a6a8e6c8..f1121d814e87 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1130,6 +1130,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
			 u64 objectid)
 {
 	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+	root->fs_info = fs_info;
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->state = 0;
@@ -1198,11 +1199,11 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
-					   gfp_t flags)
+					   u64 objectid, gfp_t flags)
 {
 	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
 	if (root)
-		root->fs_info = fs_info;
+		__setup_root(root, fs_info, objectid);
 	return root;
 }
 
@@ -1215,12 +1216,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
 	if (!fs_info)
 		return ERR_PTR(-EINVAL);
 
-	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
 	/* We don't use the stripesize in selftest, set it as sectorsize */
-	__setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
 	root->alloc_bytenr = 0;
 
 	return root;
@@ -1244,12 +1244,11 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
	 * context to avoid deadlock if reclaim happens.
	 */
 	nofs_flag = memalloc_nofs_save();
-	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
 	memalloc_nofs_restore(nofs_flag);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(root, fs_info, objectid);
 	root->root_key.objectid = objectid;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = 0;
@@ -1309,12 +1308,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root;
 	struct extent_buffer *leaf;
 
-	root = btrfs_alloc_root(fs_info, GFP_NOFS);
+	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
@@ -1401,14 +1398,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	if (!path)
 		return ERR_PTR(-ENOMEM);
 
-	root = btrfs_alloc_root(fs_info, GFP_NOFS);
+	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
 	if (!root) {
 		ret = -ENOMEM;
 		goto alloc_fail;
 	}
 
-	__setup_root(root, fs_info, key->objectid);
-
 	ret = btrfs_find_root(tree_root, key, path, &root->root_item,
			      &root->root_key);
 	if (ret) {
@@ -2208,12 +2203,11 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 		return -EIO;
 	}
 
-	log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
+					 GFP_KERNEL);
 	if (!log_tree_root)
 		return -ENOMEM;
 
-	__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
 	log_tree_root->node = read_tree_block(fs_info, bytenr,
					      fs_info->generation + 1,
					      level, NULL);
@@ -2645,8 +2639,12 @@ int __cold open_ctree(struct super_block *sb,
 	int clear_free_space_tree = 0;
 	int level;
 
-	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
-	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
+				     GFP_KERNEL);
+	fs_info->tree_root = tree_root;
+	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
+				      GFP_KERNEL);
+	fs_info->chunk_root = chunk_root;
 	if (!tree_root || !chunk_root) {
 		err = -ENOMEM;
 		goto fail;
@@ -2824,8 +2822,6 @@ int __cold open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	__setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
-
 	invalidate_bdev(fs_devices->latest_bdev);
 
 	/*
@@ -3021,8 +3017,6 @@ int __cold open_ctree(struct super_block *sb,
 	generation = btrfs_super_chunk_root_generation(disk_super);
 	level = btrfs_super_chunk_root_level(disk_super);
 
-	__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
-
 	chunk_root->node = read_tree_block(fs_info,
					   btrfs_super_chunk_root(disk_super),
					   generation, level, NULL);

From f39e457156f919b849eb2c12a7a4452a1f2be15b Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:19 -0500
Subject: [PATCH 011/210] btrfs: move fs root init stuff into
 btrfs_init_fs_root

We have a helper for reading fs roots that just reads the fs root off
the disk and then sets REF_COWS and initializes the inheritable flags.
Move this into btrfs_init_fs_root so we can later get rid of this
helper and consolidate all of the fs root reading into one helper.

Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f1121d814e87..eff6f0e45c3e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1445,12 +1445,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 	root = btrfs_read_tree_root(tree_root, location);
 	if (IS_ERR(root))
 		return root;
-
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
-		btrfs_check_and_init_root_item(&root->root_item);
-	}
-
 	return root;
 }
 
@@ -1474,6 +1468,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 	}
 	root->subv_writers = writers;
 
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+		btrfs_check_and_init_root_item(&root->root_item);
+	}
+
 	btrfs_init_free_ino_ctl(root);
 	spin_lock_init(&root->ino_cache_lock);
 	init_waitqueue_head(&root->ino_cache_wait);

From e59d18b45d081d66a2fb1a9d144fa82d4807b18d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:20 -0500
Subject: [PATCH 012/210] btrfs: make btrfs_find_orphan_roots use
 btrfs_get_fs_root

btrfs_find_orphan_roots has this weird thing where it looks the root up
in the cache to see if it is there before just reading it. But the read
it uses just reads the root; it doesn't do any of the init work, which
we do by hand here. This is unnecessary: all we really want is to see
if the root still exists and add it to the dead roots list to be
cleaned up, otherwise we delete the orphan item.

Fix this by just using btrfs_get_fs_root directly with check_ref set to
false so we get the orphan root items. Then we handle in-cache and
out-of-cache roots the same: add them to the dead roots list and carry
on.
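The shape of the simplification, as a hedged userspace C sketch
(cache_lookup, read_and_init_root_from_disk and cache_insert are
hypothetical stand-ins, not btrfs functions): instead of checking the
cache and, on a miss, reading the root raw and hand-initializing it,
every caller goes through one getter that does the whole job.

  /* Hypothetical helpers standing in for the btrfs root cache and the
   * on-disk read path. */
  struct root;
  struct root *cache_lookup(unsigned long long id);
  struct root *read_and_init_root_from_disk(unsigned long long id);
  void cache_insert(struct root *root);

  /* One lookup path for everyone, cached or not. */
  static struct root *get_root(unsigned long long id)
  {
          struct root *root = cache_lookup(id);

          if (!root) {
                  root = read_and_init_root_from_disk(id);
                  if (root)
                          cache_insert(root);
          }
          return root;
  }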
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/root-tree.c | 36 ++----------------------------------
 1 file changed, 2 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..1ea4a35a0830 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -255,25 +255,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 		root_key.objectid = key.offset;
 		key.offset++;
 
-		/*
-		 * The root might have been inserted already, as before we look
-		 * for orphan roots, log replay might have happened, which
-		 * triggers a transaction commit and qgroup accounting, which
-		 * in turn reads and inserts fs roots while doing backref
-		 * walking.
-		 */
-		root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
-		if (root) {
-			WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
-					  &root->state));
-			if (btrfs_root_refs(&root->root_item) == 0) {
-				set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
-				btrfs_add_dead_root(root);
-			}
-			continue;
-		}
-
-		root = btrfs_read_fs_root(tree_root, &root_key);
+		root = btrfs_get_fs_root(fs_info, &root_key, false);
 		err = PTR_ERR_OR_ZERO(root);
 		if (err && err != -ENOENT) {
 			break;
@@ -300,21 +282,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 			continue;
 		}
 
-		err = btrfs_init_fs_root(root);
-		if (err) {
-			btrfs_free_fs_root(root);
-			break;
-		}
-
-		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
-
-		err = btrfs_insert_fs_root(fs_info, root);
-		if (err) {
-			BUG_ON(err == -EEXIST);
-			btrfs_free_fs_root(root);
-			break;
-		}
-
+		WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
 		if (btrfs_root_refs(&root->root_item) == 0) {
 			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 			btrfs_add_dead_root(root);

From 62a2c73ebda3b0163832610c0e91f350a22908bd Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:21 -0500
Subject: [PATCH 013/210] btrfs: export and use btrfs_read_tree_root for
 tree-log

Tree-log uses btrfs_read_fs_root to load its log, but this just calls
btrfs_read_tree_root. We don't save the log roots in our root cache,
so just export this helper and use it in the logging code.
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c  | 4 ++--
 fs/btrfs/disk-io.h  | 2 ++
 fs/btrfs/tree-log.c | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eff6f0e45c3e..9807167b6208 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1384,8 +1384,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
-					       struct btrfs_key *key)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					struct btrfs_key *key)
 {
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c2d6cf1ce59..158fec0eeef2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -58,6 +58,8 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
			     struct buffer_head **bh_ret);
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					struct btrfs_key *key);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
				      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d3b101f2816a..6827e0927526 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6150,7 +6150,7 @@ again:
 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 			break;
 
-		log = btrfs_read_fs_root(log_root_tree, &found_key);
+		log = btrfs_read_tree_root(log_root_tree, &found_key);
 		if (IS_ERR(log)) {
 			ret = PTR_ERR(log);
 			btrfs_handle_fs_error(fs_info, ret,

From 3dbf1738a1d3429248a01bcb46ce5cec0eae24f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:22 -0500
Subject: [PATCH 014/210] btrfs: make relocation use btrfs_read_tree_root()

Relocation has its special roots; we don't want to save these in the
root cache either, so swap it to use btrfs_read_tree_root(). However
the reloc root does need REF_COWS set, so make sure we set it
everywhere we use this helper, as it no longer does the REF_COWS
setting.
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 35873254d901..2056b0e643bb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1494,8 +1494,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	kfree(root_item);
 
-	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
+	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
 	BUG_ON(IS_ERR(reloc_root));
+	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
 	reloc_root->last_trans = trans->transid;
 	return reloc_root;
 }
@@ -4584,12 +4585,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		    key.type != BTRFS_ROOT_ITEM_KEY)
 			break;
 
-		reloc_root = btrfs_read_fs_root(root, &key);
+		reloc_root = btrfs_read_tree_root(root, &key);
 		if (IS_ERR(reloc_root)) {
 			err = PTR_ERR(reloc_root);
 			goto out;
 		}
 
+		set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
 		list_add(&reloc_root->root_list, &reloc_roots);
 
 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {

From 83db2aadb3bd3a117918434448e8c0e90229b73b Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:23 -0500
Subject: [PATCH 015/210] btrfs: remove btrfs_read_fs_root, not used anymore

All helpers should either be using btrfs_get_fs_root() or
btrfs_read_tree_root().

Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 13 +------------
 fs/btrfs/disk-io.h |  2 --
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9807167b6208..2e92a6bc64c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1437,17 +1437,6 @@ alloc_fail:
 	goto out;
 }
 
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
-				      struct btrfs_key *location)
-{
-	struct btrfs_root *root;
-
-	root = btrfs_read_tree_root(tree_root, location);
-	if (IS_ERR(root))
-		return root;
-	return root;
-}
-
 int btrfs_init_fs_root(struct btrfs_root *root)
 {
 	int ret;
@@ -1568,7 +1557,7 @@ again:
 		return root;
 	}
 
-	root = btrfs_read_fs_root(fs_info->tree_root, location);
+	root = btrfs_read_tree_root(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 158fec0eeef2..4e43bd37f9c5 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,8 +60,6 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					struct btrfs_key *key);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
-				      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					u64 root_id);

From 3619c94f073e4e96bef4cc15e70adbc36f3cb203 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 24 Jan 2020 09:32:24 -0500
Subject: [PATCH 016/210] btrfs: open code btrfs_read_fs_root_no_name

All this does is call btrfs_get_fs_root() with check_ref == true. Just
use btrfs_get_fs_root() so we don't have a bunch of different helpers
that do the same thing.
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/disk-io.h     |  6 ------
 fs/btrfs/export.c      |  2 +-
 fs/btrfs/file.c        |  2 +-
 fs/btrfs/inode.c       |  2 +-
 fs/btrfs/ioctl.c       | 12 ++++++------
 fs/btrfs/scrub.c       |  2 +-
 fs/btrfs/send.c        |  4 ++--
 fs/btrfs/super.c       |  2 +-
 fs/btrfs/transaction.c |  2 +-
 fs/btrfs/tree-log.c    |  2 +-
 fs/btrfs/volumes.c     |  2 +-
 12 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2e92a6bc64c0..f0a33e004d91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3178,7 +3178,7 @@ int __cold open_ctree(struct super_block *sb,
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = 0;
 
-	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4e43bd37f9c5..c2e765edf034 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -70,12 +70,6 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key,
				     bool check_ref);
-static inline struct btrfs_root *
-btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
-			   struct btrfs_key *location)
-{
-	return btrfs_get_fs_root(fs_info, location, true);
-}
 
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 72e312cae69d..08cd8c4a02a5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -77,7 +77,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
 
-	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	root = btrfs_get_fs_root(fs_info, &key, true);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
 		goto fail;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 06a2f0f2e602..1bc7665f8ff0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -287,7 +287,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
 
-	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	inode_root = btrfs_get_fs_root(fs_info, &key, true);
 	if (IS_ERR(inode_root)) {
 		ret = PTR_ERR(inode_root);
 		goto cleanup;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8b801a1e7845..d105d4d9e9aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5153,7 +5153,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
 
 	btrfs_release_path(path);
 
-	new_root = btrfs_read_fs_root_no_name(fs_info, location);
+	new_root = btrfs_get_fs_root(fs_info, location, true);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba1608d0cd8e..5342e4a2bb9a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -666,7 +666,7 @@ static noinline int create_subvol(struct inode *dir,
 		goto fail;
 
 	key.offset = (u64)-1;
-	new_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	new_root = btrfs_get_fs_root(fs_info, &key, true);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
 		btrfs_abort_transaction(trans, ret);
@@ -2179,7 +2179,7 @@ static noinline int search_ioctl(struct inode *inode,
 		key.objectid = sk->tree_id;
 		key.type = BTRFS_ROOT_ITEM_KEY;
 		key.offset = (u64)-1;
-		root = btrfs_read_fs_root_no_name(info, &key);
+		root = btrfs_get_fs_root(info, &key, true);
 		if (IS_ERR(root)) {
 			btrfs_free_path(path);
 			return PTR_ERR(root);
@@ -2314,7 +2314,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.objectid = tree_id;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
-	root = btrfs_read_fs_root_no_name(info, &key);
+	root = btrfs_get_fs_root(info, &key, true);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
 		goto out;
@@ -2408,7 +2408,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
 			key.objectid = treeid;
 			key.type = BTRFS_ROOT_ITEM_KEY;
 			key.offset = (u64)-1;
-			root = btrfs_read_fs_root_no_name(fs_info, &key);
+			root = btrfs_get_fs_root(fs_info, &key, true);
 			if (IS_ERR(root)) {
 				ret = PTR_ERR(root);
 				goto out;
@@ -2653,7 +2653,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
 	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
-	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	root = btrfs_get_fs_root(fs_info, &key, true);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
 		goto out;
@@ -3983,7 +3983,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = (u64)-1;
 
-	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+	new_root = btrfs_get_fs_root(fs_info, &location, true);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 61b37c56a7fb..7e18dde5dcb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 	root_key.objectid = root;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
-	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	local_root = btrfs_get_fs_root(fs_info, &root_key, true);
 	if (IS_ERR(local_root)) {
 		ret = PTR_ERR(local_root);
 		goto err;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a055b657cb85..27e580564e49 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -7195,7 +7195,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 
 		index = srcu_read_lock(&fs_info->subvol_srcu);
 
-		clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		clone_root = btrfs_get_fs_root(fs_info, &key, true);
 		if (IS_ERR(clone_root)) {
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = PTR_ERR(clone_root);
@@ -7234,7 +7234,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 
 		index = srcu_read_lock(&fs_info->subvol_srcu);
 
-		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true);
 		if (IS_ERR(sctx->parent_root)) {
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = PTR_ERR(sctx->parent_root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1cbc3f125d5a..a2c2e71af122 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1096,7 +1096,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
 		key.objectid = subvol_objectid;
 		key.type = BTRFS_ROOT_ITEM_KEY;
 		key.offset = (u64)-1;
-		fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		fs_root = btrfs_get_fs_root(fs_info, &key, true);
 		if (IS_ERR(fs_root)) {
 			ret = PTR_ERR(fs_root);
 			goto err;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index beb6c69cd1e5..bcf23b06e67f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1633,7 +1633,7 @@ static noinline int
create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); + pending->snap = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6827e0927526..556aa4a614f5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6162,7 +6162,7 @@ again: tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; - wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9cfc668f91f4..0b3167bd9f35 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4424,7 +4424,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, key.objectid = subid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); + subvol_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(subvol_root)) { ret = PTR_ERR(subvol_root); if (ret == -ENOENT) From a98db0f304670dfd65375e6d0ea08793c174bf44 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:25 -0500 Subject: [PATCH 017/210] btrfs: make the fs root init functions static Now that the orphan cleanup stuff doesn't use this directly we can just make them static. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/disk-io.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0a33e004d91..67e7b0ccdea0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1437,7 +1437,7 @@ alloc_fail: goto out; } -int btrfs_init_fs_root(struct btrfs_root *root) +static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; struct btrfs_subvolume_writers *writers; @@ -1488,8 +1488,8 @@ fail: return ret; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c2e765edf034..7aa1c7a3a115 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -60,9 +60,6 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key); -int btrfs_init_fs_root(struct btrfs_root *root); -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); From 4cdfd93002cb84471ed85b4999cd38077a317873 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:26 -0500 Subject: [PATCH 018/210] btrfs: handle NULL roots in btrfs_put/btrfs_grab_fs_root We want to use this for dropping all roots, and in some error cases we may not have a root, so handle this to make the cleanup code easier. Make btrfs_grab_fs_root the same so we can use it in cases where the root may not exist (like the quota root). 
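As a sketch of why the NULL tolerance helps (the quota root case mentioned above; illustrative code, not part of this patch), teardown and error paths can drop roots unconditionally, the same way kfree() tolerates NULL:

    /* Before: every put of a maybe-absent root needed a guard. */
    if (fs_info->quota_root)
        btrfs_put_fs_root(fs_info->quota_root);

    /* After: the helper absorbs the check. */
    btrfs_put_fs_root(fs_info->quota_root);

btrfs_grab_fs_root() gets the same treatment and simply returns NULL when handed NULL.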
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7aa1c7a3a115..8add2e14aab1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -88,6 +88,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) { + if (!root) + return NULL; if (refcount_inc_not_zero(&root->refs)) return root; return NULL; @@ -95,6 +97,8 @@ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) static inline void btrfs_put_fs_root(struct btrfs_root *root) { + if (!root) + return; if (refcount_dec_and_test(&root->refs)) kfree(root); } From 734d8c15df8a6c2e0963e5f4133dfde4acb5fc44 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 4 Feb 2020 13:18:54 -0500 Subject: [PATCH 019/210] btrfs: add a comment describing block reserves This is a giant comment at the top of block-rsv.c describing generally how block reserves work. It is purely about the block reserves themselves, and nothing to do with how the actual reservation system works. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 92 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index d07bd41a7c1e..e46dc3688983 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,98 @@ #include "space-info.h" #include "transaction.h" +/* + * HOW DO BLOCK RESERVES WORK + * + * Think of block_rsv's as buckets for logically grouped metadata + * reservations. Each block_rsv has a ->size and a ->reserved. ->size is + * how large we want our block rsv to be, ->reserved is how much space is + * currently reserved for this block reserve. + * + * ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + * -> Reserve + * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + * We call into btrfs_reserve_metadata_bytes() with our bytes, which is + * accounted for in space_info->bytes_may_use, and then add the bytes to + * ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + * ->size is an over-estimation of how much we may use for a particular + * operation. + * + * -> Use + * Entrance: btrfs_use_block_rsv + * + * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + * to determine the appropriate block_rsv to use, and then verify that + * ->reserved has enough space for our tree block allocation. Once + * successful we subtract fs_info->nodesize from ->reserved. + * + * -> Finish + * Entrance: btrfs_block_rsv_release + * + * We are finished with our operation, subtract our individual reservation + * from ->size, and then subtract ->size from ->reserved and free up the + * excess if there is any. + * + * There is some logic here to refill the delayed refs rsv or the global rsv + * as needed, otherwise the excess is subtracted from + * space_info->bytes_may_use. + * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + * These behave normally, as described above, just within the confines of the + * lifetime of their particular operation (transaction for the whole trans + * handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + * It is impossible to properly account for all the space that may be required + * to make our extent tree updates. 
This block reserve acts as an overflow + * buffer in case our delayed refs reserve does not reserve enough space to + * update the extent tree. + * + * We can steal from this in some cases as well, notably on evict() or + * truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + * The individual item sizes are determined by the per-inode size + * calculations, which are described with the delalloc code. This is pretty + * straightforward, it's just that the calculation of ->size encodes a lot of + * different items, and thus it gets used when updating inodes, inserting file + * extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + * We keep a running tally of how many delayed refs we have on the system. + * We assume each one of these delayed refs is going to use a full + * reservation. We use the transaction items and pre-reserve space for every + * operation, and use this reservation to refill any gap between ->size and + * ->reserved that may exist. + * + * From there it's straightforward: removing a delayed ref means we remove its + * count from ->size and free up reservations as necessary. Since this is + * the most dynamic block reserve in the system, we will try to refill this + * block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + * This is the fallback block reserve to make us try to reserve space if we + * don't have a specific bucket for this allocation. It is mostly used for + * updating the device tree and such, since that is a separate pool we're + * content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + * This is used by things like truncate and iput. We will temporarily + * allocate a block reserve, set it to some size, and then truncate bytes + * until we have no space left. With ->failfast set we'll simply return + * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + * to make a new reservation. This is because these operations are + * unbounded, so we want to do as much work as we can, and then back off and + * re-reserve. + */ + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes, From 6f4ad559ea1cb5c5e78217405e4141c6a5981f86 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 4 Feb 2020 13:18:55 -0500 Subject: [PATCH 020/210] btrfs: add a comment describing delalloc space reservation delalloc space reservation is tricky because it encompasses both data and metadata. Make it clear what each side does, the general flow of how space is moved throughout the lifetime of a write, and what goes into the calculations. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 102 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4cdac4d834f5..f23d07a981e4 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,108 @@ #include "qgroup.h" #include "block-group.h" +/* + * HOW DOES THIS WORK + * + * There are two stages to data reservations, one for data and one for metadata + * to handle the new extents and checksums generated by writing data. + * + * + * DATA RESERVATION + * The general flow of the data reservation is as follows + * + * -> Reserve + * We call into btrfs_reserve_data_bytes() for the user request bytes that + * they wish to write.
We make this reservation and add it to + * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree + * for the range and carry on if this is buffered, or follow up trying to + * make a real allocation if we are pre-allocating or doing O_DIRECT. + * + * -> Use + * At writepages()/prealloc/O_DIRECT time we will call into + * btrfs_reserve_extent() for some part or all of this range of bytes. We + * will make the allocation, subtract the original requested length from + * space_info->bytes_may_use, and increase space_info->bytes_reserved by + * the allocated length. This distinction is important because compression + * may allocate a smaller on-disk extent than we previously reserved. + * + * -> Allocation + * finish_ordered_io() will insert the new file extent item for this range, + * and then add a delayed ref update for the extent tree. Once that delayed + * ref is written the extent size is subtracted from + * space_info->bytes_reserved and added to space_info->bytes_used. + * + * Error handling + * + * -> By the reservation maker + * This is the simplest case, we haven't completed our operation and we know + * how much we reserved, we can simply call + * btrfs_free_reserved_data_space*() and it will be removed from + * space_info->bytes_may_use. + * + * -> After the reservation has been made, but before cow_file_range() + * This is specifically for the delalloc case. You must clear + * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will + * be subtracted from space_info->bytes_may_use. + * + * METADATA RESERVATION + * The general metadata reservation lifetimes are discussed elsewhere, this + * will just focus on how it is used for delalloc space. + * + * We keep track of two things on a per-inode basis + * + * ->outstanding_extents + * This is the number of file extent items we'll need to handle all of the + * outstanding DELALLOC space we have in this inode. We limit the maximum + * size of an extent, so a large contiguous dirty area may require more than + * one outstanding_extent, which is why count_max_extents() is used to + * determine how many outstanding_extents get added. + * + * ->csum_bytes + * This is essentially how many dirty bytes we have for this inode, so we + * can calculate the number of checksum items we would have to add in order + * to checksum our outstanding data. + * + * We keep a per-inode block_rsv in order to make it easier to keep track of + * our reservation. We use btrfs_calculate_inode_block_rsv_size() to + * calculate the current theoretical maximum reservation we would need for the + * metadata for this inode. We call this and then adjust our reservation as + * necessary, either by attempting to reserve more space, or freeing up excess + * space. + * + * OUTSTANDING_EXTENTS HANDLING + * + * ->outstanding_extents is used for keeping track of how many extents we will + * need to use for this inode, and it will fluctuate depending on where you are + * in the life cycle of the dirty data. Consider the following normal case for
Consider the following normal case for + * a completely clean inode, with a num_bytes < our maximum allowed extent size + * + * -> reserve + * ->outstanding_extents += 1 (current value is 1) + * + * -> set_delalloc + * ->outstanding_extents += 1 (currrent value is 2) + * + * -> btrfs_delalloc_release_extents() + * ->outstanding_extents -= 1 (current value is 1) + * + * We must call this once we are done, as we hold our reservation for the + * duration of our operation, and then assume set_delalloc will update the + * counter appropriately. + * + * -> add ordered extent + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_clear_delalloc_extent + * ->outstanding_extents -= 1 (current value is 1) + * + * -> finish_ordered_io/btrfs_remove_ordered_extent + * ->outstanding_extents -= 1 (current value is 0) + * + * Each stage is responsible for their own accounting of the extent, thus + * making error handling and cleanup easier. + */ + int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; From 4b8b05288835501012d957bee597bffc72512646 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 4 Feb 2020 13:18:56 -0500 Subject: [PATCH 021/210] btrfs: describe the space reservation system in general Add another comment to cover how the space reservation system works generally. This covers the actual reservation flow, as well as how flushing is handled. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 147 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 01297c5b2666..3b8d8fa8d767 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,153 @@ #include "transaction.h" #include "block-group.h" +/* + * HOW DOES SPACE RESERVATION WORK + * + * If you want to know about delalloc specifically, there is a separate comment + * for that with the delalloc code. This comment is about how the whole system + * works generally. + * + * BASIC CONCEPTS + * + * 1) space_info. This is the ultimate arbiter of how much space we can use. + * There's a description of the bytes_ fields with the struct declaration, + * refer to that for specifics on each field. Suffice it to say that for + * reservations we care about total_bytes - SUM(space_info->bytes_) when + * determining if there is space to make an allocation. There is a space_info + * for METADATA, SYSTEM, and DATA areas. + * + * 2) block_rsv's. These are basically buckets for every different type of + * metadata reservation we have. You can see the comment in the block_rsv + * code on the rules for each type, but generally block_rsv->reserved is how + * much space is accounted for in space_info->bytes_may_use. + * + * 3) btrfs_calc*_size. These are the worst case calculations we used based + * on the number of items we will want to modify. We have one for changing + * items, and one for inserting new items. Generally we use these helpers to + * determine the size of the block reserves, and then use the actual bytes + * values to adjust the space_info counters. + * + * MAKING RESERVATIONS, THE NORMAL CASE + * + * We call into either btrfs_reserve_data_bytes() or + * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with + * num_bytes we want to reserve. 
+ * + * ->reserve + * space_info->bytes_may_use += num_bytes + * + * ->extent allocation + * Call btrfs_add_reserved_bytes() which does + * space_info->bytes_may_use -= num_bytes + * space_info->bytes_reserved += extent_bytes + * + * ->insert reference + * Call btrfs_update_block_group() which does + * space_info->bytes_reserved -= extent_bytes + * space_info->bytes_used += extent_bytes + * + * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) + * + * Assume we are unable to simply make the reservation because we do not have + * enough space + * + * -> __reserve_bytes + * create a reserve_ticket with ->bytes set to our reservation, add it to + * the tail of space_info->tickets, kick async flush thread + * + * ->handle_reserve_ticket + * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set + * on the ticket. + * + * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space + * Flushes various things attempting to free up space. + * + * -> btrfs_try_granting_tickets() + * This is called by anything that either subtracts space from + * space_info->bytes_may_use, ->bytes_pinned, etc., or adds to the + * space_info->total_bytes. This loops through the ->priority_tickets and + * then the ->tickets list checking to see if the reservation can be + * completed. If it can the space is added to space_info->bytes_may_use and + * the ticket is woken up. + * + * -> ticket wakeup + * Check if ->bytes == 0, if it is we got our reservation and we can carry + * on, if not return the appropriate error (ENOSPC, but can be EINTR if we + * were interrupted.) + * + * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY + * + * Same as the above, except we add ourselves to the + * space_info->priority_tickets, and we do not use ticket->wait, we simply + * call flush_space() ourselves for the states that are safe for us to call + * without deadlocking and hope for the best. + * + * THE FLUSHING STATES + * + * Generally speaking we will have two cases for each state, a "nice" state + * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to + * reduce the locking overhead on the various trees, and even to keep from + * doing any work at all in the case of delayed refs. Each of these delayed + * things however holds reservations, and so letting them run allows us to + * reclaim space so we can make new reservations. + * + * FLUSH_DELAYED_ITEMS + * Every inode has a delayed item to update the inode. Take a simple write + * for example, we would update the inode item at write time to update the + * mtime, and then again at finish_ordered_io() time in order to update the + * isize or bytes. We keep these delayed items to coalesce these operations + * into a single operation done on demand. These are an easy way to reclaim + * metadata space. + * + * FLUSH_DELALLOC + * Look at the delalloc comment to get an idea of how much space is reserved + * for delayed allocation. We can reclaim some of this space simply by + * running delalloc, but usually we need to wait for ordered extents to + * reclaim the bulk of this space. + * + * FLUSH_DELAYED_REFS + * We have a block reserve for the outstanding delayed refs space, and every + * delayed ref operation holds a reservation. Running these is a quick way + * to reclaim space, but we want to hold this until the end because COW can + * churn a lot and we can avoid making some extent tree modifications if we + * are able to delay for as long as possible.
+ * + * ALLOC_CHUNK + * We will skip this the first time through space reservation, because of + * overcommit and we don't want to have a lot of useless metadata space when + * our worst case reservations will likely never come true. + * + * RUN_DELAYED_IPUTS + * If we're freeing inodes we're likely freeing checksums, file extent + * items, and extent tree items. Loads of space could be freed up by these + * operations, however they won't be usable until the transaction commits. + * + * COMMIT_TRANS + * may_commit_transaction() is the ultimate arbiter on whether we commit the + * transaction or not. In order to avoid constantly churning we do all the + * above flushing first and then commit the transaction as the last resort. + * However we need to take into account things like pinned space that would + * be freed, plus any delayed work we may not have gotten rid of in the case + * of metadata. + * + * OVERCOMMIT + * + * Because we hold so many reservations for metadata we will allow you to + * reserve more space than is currently free in the currently allocated + * metadata space. This only happens with metadata, data does not allow + * overcommitting. + * + * You can see the current logic for when we allow overcommit in + * btrfs_can_overcommit(), but it only applies to unallocated space. If there + * is no unallocated space to be had, all reservations are kept within the + * free space in the allocated metadata chunks. + * + * Because of overcommitting, you generally want to use the + * btrfs_can_overcommit() logic for metadata allocations, as it does the right + * thing with or without extra unallocated space. + */ + u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { From af01d2e53f3343e9f22495e79f07b107374b05a1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:27 -0500 Subject: [PATCH 022/210] btrfs: hold a ref on fs roots while they're in the radix tree If the root is sitting in the radix tree, we should probably have a ref for the radix tree. Grab a ref on the root when we insert it, and drop it when it gets deleted. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 67e7b0ccdea0..43af9f9b8583 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,8 +1513,10 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { + btrfs_grab_fs_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); @@ -3817,6 +3819,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + btrfs_put_fs_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) From 9326f76f4bc4f4fc624d2f5263386ac879957867 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:28 -0500 Subject: [PATCH 023/210] btrfs: hold a ref on the root in resolve_indirect_ref We're looking up a random root, we need to hold a ref on it while we're using it.
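The conversion, roughly (the lookup line is paraphrased, the rest matches the diff below; the srcu unlock is repeated because this lookup runs under subvol_srcu):

    root = btrfs_get_fs_root(fs_info, &root_key, false);
    if (IS_ERR(root)) {
        srcu_read_unlock(&fs_info->subvol_srcu, index);
        ret = PTR_ERR(root);
        goto out_free;
    }
    if (!btrfs_grab_fs_root(root)) {
        /* Last reference already gone, treat the root as not found. */
        srcu_read_unlock(&fs_info->subvol_srcu, index);
        ret = -ENOENT;
        goto out_free;
    }

btrfs_grab_fs_root() is refcount_inc_not_zero() underneath, so a failed grab means the root is already on its way to being freed.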
Reviewed-by: David Sterba Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e5d85311d5d5..193747b6e1f9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -524,7 +524,13 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (IS_ERR(root)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); - goto out; + goto out_free; + } + + if (!btrfs_grab_fs_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out_free; } if (btrfs_is_testing(fs_info)) { @@ -577,6 +583,8 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, ret = add_all_parents(root, path, parents, ref, level, time_seq, extent_item_pos, total_refs, ignore_offset); out: + btrfs_put_fs_root(root); +out_free: path->lowest_level = 0; btrfs_release_path(path); return ret; From bdf70b9e75f54a04941dc450abe61694e03fead4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:29 -0500 Subject: [PATCH 024/210] btrfs: hold a root ref in btrfs_get_dentry Looking up the inode we need to search the root, make sure we hold a reference on that root while we're doing the lookup. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/export.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 08cd8c4a02a5..eba6c6d27bad 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -82,12 +82,17 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, err = PTR_ERR(root); goto fail; } + if (!btrfs_grab_fs_root(root)) { + err = -ENOENT; + goto fail; + } key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root); + btrfs_put_fs_root(root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail; From 02162a0265eb56742442ef42c58db8739ddd9b94 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:30 -0500 Subject: [PATCH 025/210] btrfs: hold a ref on the root in __btrfs_run_defrag_inode We are looking up an arbitrary inode, we need to hold a ref on the root while we're doing this. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1bc7665f8ff0..27237bbaad65 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -292,11 +292,16 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } + if (!btrfs_grab_fs_root(inode_root)) { + ret = -ENOENT; + goto cleanup; + } key.objectid = defrag->ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root); + btrfs_put_fs_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; From 8727002f79095d006a642f4f1f42372e84a77082 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:31 -0500 Subject: [PATCH 026/210] btrfs: hold a ref on the root in fixup_tree_root_location Looking up the inode from an arbitrary tree means we need to hold a ref on that root. 
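The resulting ownership rule, taken from the diff below: fixup_tree_root_location() hands back *sub_root with a reference held, and btrfs_lookup_dentry() drops it as soon as the inode lookup is done, but only when it actually switched roots:

    inode = btrfs_iget(dir->i_sb, &location, sub_root);
    if (root != sub_root)
        btrfs_put_fs_root(sub_root);

The asymmetry is deliberate: when no fixup happened, sub_root is still the root we started with and no extra reference was ever taken.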
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d105d4d9e9aa..324fc79174e6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5158,6 +5158,10 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = PTR_ERR(new_root); goto out; } + if (!btrfs_grab_fs_root(new_root)) { + err = -ENOENT; + goto out; + } *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); @@ -5400,6 +5404,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } else { inode = btrfs_iget(dir->i_sb, &location, sub_root); } + if (root != sub_root) + btrfs_put_fs_root(sub_root); srcu_read_unlock(&fs_info->subvol_srcu, index); if (!IS_ERR(inode) && root != sub_root) { From fc92f79856aaf4edb51ef8ca7650747df34b6dd6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:32 -0500 Subject: [PATCH 027/210] btrfs: hold a ref on the root in create_subvol We're creating the new root here, but we should hold the ref until after we've initialized the inode for it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5342e4a2bb9a..0e524f3eaa9f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -672,10 +672,16 @@ static noinline int create_subvol(struct inode *dir, btrfs_abort_transaction(trans, ret); goto fail; } + if (!btrfs_grab_fs_root(new_root)) { + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + goto fail; + } btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + btrfs_put_fs_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, ret); From 3ca35e839e949811c7e35b7ef786de225a1c5008 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:33 -0500 Subject: [PATCH 028/210] btrfs: hold a ref on the root in search_ioctl We lookup an arbitrary fs root, we need to hold a ref on that root. If we're using our own inode's root then grab a ref on that as well to make the cleanup easier.
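The 'own root' trick looks like this in the diff below: take a counted reference even on the root we already reach through the inode, so both branches exit through a single btrfs_put_fs_root() (sketch, details trimmed):

    if (sk->tree_id == 0) {
        /* Search the root of the inode that was passed. */
        root = btrfs_grab_fs_root(BTRFS_I(inode)->root);
    } else {
        root = btrfs_get_fs_root(info, &key, true);
        ...
        if (!btrfs_grab_fs_root(root)) {
            btrfs_free_path(path);
            return -ENOENT;
        }
    }
    ...
err:
    sk->nr_items = num_found;
    btrfs_put_fs_root(root);
    btrfs_free_path(path);
    return ret;

Grabbing our own inode's root is not expected to fail, since the open inode keeps that root alive.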
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e524f3eaa9f..b173bb6915d9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2180,7 +2180,7 @@ static noinline int search_ioctl(struct inode *inode, if (sk->tree_id == 0) { /* search the root of the inode that was passed */ - root = BTRFS_I(inode)->root; + root = btrfs_grab_fs_root(BTRFS_I(inode)->root); } else { key.objectid = sk->tree_id; key.type = BTRFS_ROOT_ITEM_KEY; @@ -2190,6 +2190,10 @@ static noinline int search_ioctl(struct inode *inode, btrfs_free_path(path); return PTR_ERR(root); } + if (!btrfs_grab_fs_root(root)) { + btrfs_free_path(path); + return -ENOENT; + } } key.objectid = sk->min_objectid; @@ -2214,6 +2218,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; err: sk->nr_items = num_found; + btrfs_put_fs_root(root); btrfs_free_path(path); return ret; } From 88234012beaaf677dc6d3e685d270acc1bb56a03 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:34 -0500 Subject: [PATCH 029/210] btrfs: hold a ref on the root in btrfs_search_path_in_tree We look up an arbitrary fs root, we need to hold a ref on it while we're doing our search. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b173bb6915d9..9c8a9944f848 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2328,6 +2328,12 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, root = btrfs_get_fs_root(info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); + root = NULL; + goto out; + } + if (!btrfs_grab_fs_root(root)) { + ret = -ENOENT; + root = NULL; goto out; } @@ -2378,6 +2384,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, name[total_len] = '\0'; ret = 0; out: + btrfs_put_fs_root(root); btrfs_free_path(path); return ret; } From b8a49ae1913f1befdbd477f805e99cb61d7971df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:35 -0500 Subject: [PATCH 030/210] btrfs: hold a ref on the root in btrfs_search_path_in_tree_user We can wander into a different root, so grab a ref on the root we look up. Later on we make root = fs_info->tree_root so we need this separate out label to make sure we do the right cleanup only in the case we're looking up a different root. 
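The cleanup idiom used here, reduced to its bones (a sketch; the NULL initialization is what lets the shared label stay unconditional):

    struct btrfs_root *root = NULL;
    ...
    root = btrfs_get_fs_root(info, &key, true);
    if (IS_ERR(root)) {
        ret = PTR_ERR(root);
        root = NULL;        /* never put an ERR_PTR */
        goto out;
    }
    if (!btrfs_grab_fs_root(root)) {
        ret = -ENOENT;
        root = NULL;
        goto out;
    }
    ...
out:
    btrfs_put_fs_root(root);    /* a no-op for NULL thanks to the earlier patch */
    btrfs_free_path(path);
    return ret;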
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9c8a9944f848..7e1af2d46dd2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2401,7 +2401,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, unsigned long item_len; struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; - struct btrfs_root *root; + struct btrfs_root *root = NULL; struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; @@ -2431,6 +2431,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(root); goto out; } + if (!btrfs_grab_fs_root(root)) { + ret = -ENOENT; + goto out; + } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -2438,15 +2442,15 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_REF_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } } @@ -2460,7 +2464,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, total_len += len + 1; if (ptr < args->path) { ret = -ENAMETOOLONG; - goto out; + goto out_put; } *(ptr + len) = '/'; @@ -2471,10 +2475,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_ITEM_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } leaf = path->nodes[0]; @@ -2482,26 +2486,26 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, btrfs_item_key_to_cpu(leaf, &key2, slot); if (key2.objectid != dirid) { ret = -ENOENT; - goto out; + goto out_put; } temp_inode = btrfs_iget(sb, &key2, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); - goto out; + goto out_put; } ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; - goto out; + goto out_put; } if (key.offset == upper_limit.objectid) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; - goto out; + goto out_put; } btrfs_release_path(path); @@ -2512,15 +2516,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; + btrfs_put_fs_root(root); + root = NULL; btrfs_release_path(path); } /* Get the bottom subvolume's name from ROOT_REF */ - root = fs_info->tree_root; key.objectid = treeid; key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2547,6 +2552,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, read_extent_buffer(leaf, args->name, item_off, item_len); args->name[item_len] = 0; +out_put: + btrfs_put_fs_root(root); out: btrfs_free_path(path); return ret; From 04734e844894ccdfcb907d10eec1433e19d4918a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:36 -0500 Subject: [PATCH 031/210] btrfs: hold a ref on the root in btrfs_ioctl_get_subvol_info We look up whatever root userspace has given us, we need to hold a ref throughout this operation. 
Use 'root' only for the subvolume root we look up and not as a temporary variable elsewhere. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e1af2d46dd2..0aa47bc3e172 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2681,6 +2681,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); + goto out_free; + } + if (!btrfs_grab_fs_root(root)) { + ret = -ENOENT; goto out; } root_item = &root->root_item; @@ -2714,16 +2718,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) if (key.objectid != BTRFS_FS_TREE_OBJECTID) { /* Search root tree for ROOT_BACKREF of this subvolume */ - root = fs_info->tree_root; - key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2758,6 +2760,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) ret = -EFAULT; out: + btrfs_put_fs_root(root); +out_free: btrfs_free_path(path); kzfree(subvol_info); return ret; From 2a2b5d62026633d6fc515b1c0fe3a5e58cae7891 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:37 -0500 Subject: [PATCH 032/210] btrfs: hold ref on root in btrfs_ioctl_default_subvol We look up an arbitrary fs root here, we need to hold a ref on the root for the duration.
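One supporting detail in the diff below: the early exits switch from btrfs_free_path() to btrfs_release_path(). btrfs_release_path() drops the locks and extent buffer references the path holds but keeps the struct usable, while btrfs_free_path() also frees it and accepts NULL. Initializing path to NULL and freeing once at out_free keeps every exit uniform:

    struct btrfs_path *path = NULL;
    ...
    if (IS_ERR_OR_NULL(di)) {
        btrfs_release_path(path);    /* drop refs and locks, keep the struct */
        btrfs_end_transaction(trans);
        ...
        goto out_free;
    }
    ...
out_free:
    btrfs_put_fs_root(new_root);
    btrfs_free_path(path);        /* NULL-safe */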
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0aa47bc3e172..5fde22db1727 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3986,7 +3986,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key location; struct btrfs_disk_key disk_key; u64 objectid = 0; @@ -4017,44 +4017,50 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->root_key.objectid)) { + if (!btrfs_grab_fs_root(new_root)) { ret = -ENOENT; goto out; } + if (!is_fstree(new_root->root_key.objectid)) { + ret = -ENOENT; + goto out_free; + } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out; + goto out_free; } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_free_path(path); ret = PTR_ERR(trans); - goto out; + goto out_free; } dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { - btrfs_free_path(path); + btrfs_release_path(path); btrfs_end_transaction(trans); btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; - goto out; + goto out_free; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); +out_free: + btrfs_put_fs_root(new_root); + btrfs_free_path(path); out: mnt_drop_write_file(file); return ret; From 0b530bc5e11f06da8cb6791d658c3ac2dced2486 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:38 -0500 Subject: [PATCH 033/210] btrfs: hold a ref on the root in build_backref_tree This is trickier than the previous conversions. We have backref_node's that need to hold onto their root for their lifetime. Do the read of the root and grab the ref. If at any point we don't use the root we discard it; however, if we use it in our backref node we don't free it until we free the backref node. Any time we switch the roots for the backref node we need to drop our ref on the old root and grab the ref on the new root, and if we dupe a node we need to get a ref on the root there as well.
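The ownership rule this sets up, distilled from the diffs below: a backref node owns one counted reference through node->root, every reassignment pairs a put with a grab, and the final put sits next to the kfree:

    /* switching a node over to a new root */
    btrfs_put_fs_root(node->root);
    node->root = btrfs_grab_fs_root(root);
    ASSERT(node->root);

    /* free_backref_node() drops whatever the node still owns */
    cache->nr_nodes--;
    btrfs_put_fs_root(node->root);
    kfree(node);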
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2056b0e643bb..83fd74cade3b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -303,6 +303,7 @@ static void free_backref_node(struct backref_cache *cache, { if (node) { cache->nr_nodes--; + btrfs_put_fs_root(node->root); kfree(node); } } @@ -636,7 +637,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, root = (struct btrfs_root *)node->data; } spin_unlock(&rc->reloc_root_tree.lock); - return root; + return btrfs_grab_fs_root(root); } static int is_cowonly_root(u64 root_objectid) @@ -938,6 +939,10 @@ again: err = PTR_ERR(root); goto out; } + if (!btrfs_grab_fs_root(root)) { + err = -ENOENT; + goto out; + } if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; @@ -946,10 +951,12 @@ again: /* tree root */ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_fs_root(root); list_add(&cur->list, &useless); - else + } else { cur->root = root; + } break; } @@ -962,6 +969,7 @@ again: ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); path2->lowest_level = 0; if (ret < 0) { + btrfs_put_fs_root(root); err = ret; goto out; } @@ -977,6 +985,7 @@ again: root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); + btrfs_put_fs_root(root); err = -ENOENT; goto out; } @@ -988,15 +997,18 @@ again: if (!path2->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_fs_root(root); list_add(&lower->list, &useless); - else + } else { lower->root = root; + } break; } edge = alloc_backref_edge(cache); if (!edge) { + btrfs_put_fs_root(root); err = -ENOMEM; goto out; } @@ -1006,6 +1018,7 @@ again: if (!rb_node) { upper = alloc_backref_node(cache); if (!upper) { + btrfs_put_fs_root(root); free_backref_edge(cache, edge); err = -ENOMEM; goto out; @@ -1053,8 +1066,10 @@ again: edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (rb_node) + if (rb_node) { + btrfs_put_fs_root(root); break; + } lower = upper; upper = NULL; } @@ -1291,7 +1306,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->level = node->level; new_node->lowest = node->lowest; new_node->checked = 1; - new_node->root = dest; + new_node->root = btrfs_grab_fs_root(dest); + ASSERT(new_node->root); if (!node->lowest) { list_for_each_entry(edge, &node->lower, list[UPPER]) { @@ -2669,7 +2685,9 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(next->new_bytenr); BUG_ON(!list_empty(&next->list)); next->new_bytenr = root->node->start; - next->root = root; + btrfs_put_fs_root(next->root); + next->root = btrfs_grab_fs_root(root); + ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); __mark_block_processed(rc, next); @@ -3141,7 +3159,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_record_root_in_trans(trans, root); root = root->reloc_root; node->new_bytenr = root->node->start; - node->root = root; + btrfs_put_fs_root(node->root); + node->root = btrfs_grab_fs_root(root); + ASSERT(node->root); list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; From 
db2c2ca2db44e4971ec41d3245119f7bc9d25d37 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:39 -0500 Subject: [PATCH 034/210] btrfs: hold a ref on the root in prepare_to_merge We look up the reloc root's corresponding root, we need to hold a ref on that root. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 83fd74cade3b..b85850823a1b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2521,6 +2521,7 @@ again: root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); + BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); /* @@ -2532,6 +2533,7 @@ again: btrfs_update_reloc_root(trans, root); list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_fs_root(root); } list_splice(&reloc_roots, &rc->reloc_roots); From ab9737bd759741a56641d627e6a2025be18c91f9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:40 -0500 Subject: [PATCH 035/210] btrfs: hold a ref on the root in merge_reloc_roots We look up the corresponding root for the reloc root, we need to hold a ref while we're messing with it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b85850823a1b..679dd3958ff2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2592,9 +2592,11 @@ again: root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); + BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); + btrfs_put_fs_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, From 442b1ac5244ea2dc09d8aa688b41e0a0c4634b0a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:41 -0500 Subject: [PATCH 036/210] btrfs: hold a ref on the root in record_reloc_root_in_trans We are recording this root in the transaction, so we need to hold a ref on it until we do that. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 679dd3958ff2..0336a48d7d96 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2648,15 +2648,19 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; + int ret; if (reloc_root->last_trans == trans->transid) return 0; root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); + BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); + ret = btrfs_record_root_in_trans(trans, root); + btrfs_put_fs_root(root); - return btrfs_record_root_in_trans(trans, root); + return ret; } static noinline_for_stack From 3d7babdcf2cc87367e694f9b67460dde22c48319 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:42 -0500 Subject: [PATCH 037/210] btrfs: hold a ref on the root in find_data_references We're looking up the data references for the bytenr in a root, we need to hold a ref on that root while we're doing that.
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0336a48d7d96..976917ca412a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3751,7 +3751,11 @@ static int find_data_references(struct reloc_control *rc, root = read_fs_root(fs_info, ref_root); if (IS_ERR(root)) { err = PTR_ERR(root); - goto out; + goto out_free; + } + if (!btrfs_grab_fs_root(root)) { + err = -ENOENT; + goto out_free; } key.objectid = ref_objectid; @@ -3864,6 +3868,8 @@ next: } out: + btrfs_put_fs_root(root); +out_free: btrfs_free_path(path); return err; } From 76deacf02387f0cd5b6125fb54caaabe595ffc17 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:43 -0500 Subject: [PATCH 038/210] btrfs: hold a ref on the root in create_reloc_inode We're creating a reloc inode in the data reloc tree, we need to hold a ref on the root while we're doing that. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 976917ca412a..a257147db7a9 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4351,10 +4351,14 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(root)) return ERR_CAST(root); + if (!btrfs_grab_fs_root(root)) + return ERR_PTR(-ENOENT); trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_put_fs_root(root); return ERR_CAST(trans); + } err = btrfs_find_free_objectid(root, &objectid); if (err) @@ -4372,6 +4376,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: + btrfs_put_fs_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { From 932fd26df8125a5b14438563c4d3e33f59ba80f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:44 -0500 Subject: [PATCH 039/210] btrfs: hold a ref on the root in btrfs_recover_relocation We look up the fs root in various places in here when recovering from a crashed relocation. Make sure we hold a ref on these roots whenever we look them up.
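One of the lookups in the diff below needs the root only to confirm it is still alive, so the reference is taken and dropped on the spot instead of being held across any work:

    } else {
        if (!btrfs_grab_fs_root(fs_root)) {
            err = -ENOENT;
            goto out;
        }
        btrfs_put_fs_root(fs_root);
    }

The grab-then-put pair is effectively a liveness check; the root is not used again on this path, so holding the reference longer would buy nothing.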
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a257147db7a9..f0d8177cd528 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4647,6 +4647,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } + } else { + if (!btrfs_grab_fs_root(fs_root)) { + err = -ENOENT; + goto out; + } + btrfs_put_fs_root(fs_root); } } @@ -4696,10 +4702,15 @@ int btrfs_recover_relocation(struct btrfs_root *root) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } + if (!btrfs_grab_fs_root(fs_root)) { + err = -ENOENT; + goto out_free; + } err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; + btrfs_put_fs_root(fs_root); } err = btrfs_commit_transaction(trans); @@ -4731,10 +4742,14 @@ out: if (err == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) + if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); - else - err = btrfs_orphan_cleanup(fs_root); + } else { + if (btrfs_grab_fs_root(fs_root)) { + err = btrfs_orphan_cleanup(fs_root); + btrfs_put_fs_root(fs_root); + } + } } return err; } From 9f583209f20a6eb0d959416010d07582a5014098 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:45 -0500 Subject: [PATCH 040/210] btrfs: push grab_fs_root into read_fs_root All of relocation uses read_fs_root to lookup fs roots, so push the btrfs_grab_fs_root() up into that helper and remove the individual calls. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f0d8177cd528..440b662c3b0b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -659,6 +659,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, u64 root_objectid) { struct btrfs_key key; + struct btrfs_root *root; key.objectid = root_objectid; key.type = BTRFS_ROOT_ITEM_KEY; @@ -667,7 +668,12 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, else key.offset = (u64)-1; - return btrfs_get_fs_root(fs_info, &key, false); + root = btrfs_get_fs_root(fs_info, &key, false); + if (IS_ERR(root)) + return root; + if (!btrfs_grab_fs_root(root)) + return ERR_PTR(-ENOENT); + return root; } static noinline_for_stack @@ -939,10 +945,6 @@ again: err = PTR_ERR(root); goto out; } - if (!btrfs_grab_fs_root(root)) { - err = -ENOENT; - goto out; - } if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; @@ -2521,7 +2523,6 @@ again: root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); - BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); /* @@ -2592,7 +2593,6 @@ again: root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); - BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); @@ -2655,7 +2655,6 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); - BUG_ON(!btrfs_grab_fs_root(root)); BUG_ON(root->reloc_root != reloc_root); ret = btrfs_record_root_in_trans(trans, 
root); btrfs_put_fs_root(root); @@ -3753,10 +3752,6 @@ static int find_data_references(struct reloc_control *rc, err = PTR_ERR(root); goto out_free; } - if (!btrfs_grab_fs_root(root)) { - err = -ENOENT; - goto out_free; - } key.objectid = ref_objectid; key.type = BTRFS_EXTENT_DATA_KEY; @@ -4351,8 +4346,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(root)) return ERR_CAST(root); - if (!btrfs_grab_fs_root(root)) - return ERR_PTR(-ENOENT); trans = btrfs_start_transaction(root, 6); if (IS_ERR(trans)) { @@ -4648,10 +4641,6 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } } else { - if (!btrfs_grab_fs_root(fs_root)) { - err = -ENOENT; - goto out; - } btrfs_put_fs_root(fs_root); } } @@ -4702,10 +4691,6 @@ int btrfs_recover_relocation(struct btrfs_root *root) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } - if (!btrfs_grab_fs_root(fs_root)) { - err = -ENOENT; - goto out_free; - } err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ @@ -4745,10 +4730,8 @@ out: if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); } else { - if (btrfs_grab_fs_root(fs_root)) { - err = btrfs_orphan_cleanup(fs_root); - btrfs_put_fs_root(fs_root); - } + err = btrfs_orphan_cleanup(fs_root); + btrfs_put_fs_root(fs_root); } } return err; From 0b2dee5cff7490b5e874d33537d4cfcf66e5ba0d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:46 -0500 Subject: [PATCH 041/210] btrfs: hold a ref for the root in btrfs_find_orphan_roots We lookup roots for every orphan item we have, we need to hold a ref on the root while we're doing this work. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1ea4a35a0830..a32632f43987 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -257,6 +257,8 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(fs_info, &root_key, false); err = PTR_ERR_OR_ZERO(root); + if (!err && !btrfs_grab_fs_root(root)) + err = -ENOENT; if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { @@ -287,6 +289,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } + btrfs_put_fs_root(root); } btrfs_free_path(path); From fd79d43b347e12b3f7ededaadf6437c6538247c7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:47 -0500 Subject: [PATCH 042/210] btrfs: hold a ref on the root in scrub_print_warning_inode We look up the root for the bytenr that is failing, so we need to hold a ref on the root for that operation. 
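Because scrub_print_warning_inode() has several exits after the lookup, each early exit pairs with its own put; the shape, condensed from the diff below:

    if (!btrfs_grab_fs_root(local_root)) {
        ret = -ENOENT;
        goto err;
    }
    ...
    if (ret) {
        btrfs_put_fs_root(local_root);
        btrfs_release_path(swarn->path);
        goto err;
    }
    ...
    btrfs_put_fs_root(local_root);
    free_ipath(ipath);
    return 0;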
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7e18dde5dcb0..158a661ad2b0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -658,6 +658,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = PTR_ERR(local_root); goto err; } + if (!btrfs_grab_fs_root(local_root)) { + ret = -ENOENT; + goto err; + } /* * this makes the path point to (inum INODE_ITEM ioff) @@ -668,6 +672,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { + btrfs_put_fs_root(local_root); btrfs_release_path(swarn->path); goto err; } @@ -688,6 +693,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ipath = init_ipath(4096, local_root, swarn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { + btrfs_put_fs_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; goto err; @@ -711,6 +717,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); + btrfs_put_fs_root(local_root); free_ipath(ipath); return 0; From 6f9a3da5da9e7e59f3a5a5909aab5f680fe919c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:48 -0500 Subject: [PATCH 043/210] btrfs: hold a ref on the root in btrfs_ioctl_send We lookup all the clone roots and the parent root for send, so we need to hold refs on all of these roots while we're processing them. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 27e580564e49..31495cdf1877 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7201,10 +7201,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = PTR_ERR(clone_root); goto out; } + if (!btrfs_grab_fs_root(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } spin_lock(&clone_root->root_item_lock); if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); + btrfs_put_fs_root(clone_root); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; @@ -7212,6 +7218,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (clone_root->dedupe_in_progress) { dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); + btrfs_put_fs_root(clone_root); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; @@ -7240,6 +7247,12 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = PTR_ERR(sctx->parent_root); goto out; } + if (!btrfs_grab_fs_root(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + sctx->parent_root = ERR_PTR(ret); + goto out; + } spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; @@ -7267,7 +7280,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * is behind the current send position. This is checked while searching * for possible clone sources. 
*/ - sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + sctx->clone_roots[sctx->clone_roots_cnt++].root = + btrfs_grab_fs_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -7352,18 +7366,24 @@ out: } if (sort_clone_roots) { - for (i = 0; i < sctx->clone_roots_cnt; i++) + for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_fs_root(sctx->clone_roots[i].root); + } } else { - for (i = 0; sctx && i < clone_sources_to_rollback; i++) + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_fs_root(sctx->clone_roots[i].root); + } btrfs_root_dec_send_in_progress(send_root); } - if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_put_fs_root(sctx->parent_root); + } kvfree(clone_sources_tmp); From 5168489a079aacd5626156ce5f8ab77325bcde15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Feb 2020 10:24:26 -0500 Subject: [PATCH 044/210] btrfs: hold a ref on the root in get_subvol_name_from_objectid We lookup the name of a subvol which means we'll cross into different roots. Hold a ref while we're doing the look ups in the fs_root we're searching. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a2c2e71af122..2c459d29e099 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1028,7 +1028,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *fs_root; + struct btrfs_root *fs_root = NULL; struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; @@ -1099,6 +1099,12 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, fs_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); + fs_root = NULL; + goto err; + } + if (!btrfs_grab_fs_root(fs_root)) { + ret = -ENOENT; + fs_root = NULL; goto err; } @@ -1143,6 +1149,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ptr[0] = '/'; btrfs_release_path(path); } + btrfs_put_fs_root(fs_root); + fs_root = NULL; } btrfs_free_path(path); @@ -1155,6 +1163,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, return name; err: + btrfs_put_fs_root(fs_root); btrfs_free_path(path); kfree(name); return ERR_PTR(ret); From 5119cfc36f6da62ee7c8f38208afece006a27fcb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:50 -0500 Subject: [PATCH 045/210] btrfs: hold a ref on the root in create_pending_snapshot We create the snapshot and then use it for a bunch of things, we need to hold a ref on it while we're messing with it. 
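What makes the grab fallible, and why these patches must handle the -ENOENT case, is that it only takes a reference while the count is still positive, in the spirit of the kernel's refcount_inc_not_zero(). A self-contained C11 sketch of that semantic (struct obj and obj_grab are invented names, not btrfs code):

    #include <stdatomic.h>
    #include <stddef.h>

    struct obj {
            atomic_int refs;
    };

    /* Take a reference, but only if the object is not already on its way
     * to being freed.  Returns NULL once the count has reached zero. */
    static struct obj *obj_grab(struct obj *o)
    {
            int r = atomic_load(&o->refs);

            while (r > 0) {
                    /* on failure the CAS reloads r and we retry */
                    if (atomic_compare_exchange_weak(&o->refs, &r, r + 1))
                            return o;
            }
            return NULL;    /* a concurrent final put won the race */
    }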
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 + fs/btrfs/transaction.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5fde22db1727..f36aa0674ade 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -875,6 +875,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_put_fs_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (snapshot_force_cow) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bcf23b06e67f..3fa2e7d52eda 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1639,6 +1639,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } + if (!btrfs_grab_fs_root(pending->snap)) { + ret = -ENOENT; + pending->snap = NULL; + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { From ca2037fba6af039c18ae3219ff2fbacf2ca7c133 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:51 -0500 Subject: [PATCH 046/210] btrfs: hold a ref on the root in btrfs_recover_log_trees We replay the log into arbitrary fs roots, hold a ref on the root while we're doing this. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 556aa4a614f5..169270c931a9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6163,6 +6163,10 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); + if (!IS_ERR(wc.replay_dest)) { + if (!btrfs_grab_fs_root(wc.replay_dest)) + wc.replay_dest = ERR_PTR(-ENOENT); + } if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); @@ -6219,6 +6223,7 @@ again: } wc.replay_dest->log_root = NULL; + btrfs_put_fs_root(wc.replay_dest); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); From fbb0ce40d6064e731b5976f10e91ce7b9683df1f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:52 -0500 Subject: [PATCH 047/210] btrfs: hold a ref on the root in btrfs_check_uuid_tree_entry We lookup the uuid of arbitrary subvolumes, hold a ref on the root while we're doing this. 
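The fix takes the shape of the classic single-exit idiom: the reference is dropped on the success path before falling through to the common label, and a failed grab is treated the same as a stale entry. A sketch of that structure (all names here are illustrative, not the btrfs API):

    struct obj;
    struct obj *obj_grab(struct obj *o);    /* NULL if already dying */
    void obj_put(struct obj *o);
    int entry_is_stale(struct obj *o);      /* hypothetical check, 0 or 1 */

    int check_entry(struct obj *o)
    {
            int ret = 1;                    /* 1 means "remove this entry" */

            if (!obj_grab(o))
                    goto out;               /* a dying object counts as stale */

            ret = entry_is_stale(o);
            obj_put(o);                     /* put before the shared exit */
    out:
            return ret;
    }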
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b3167bd9f35..3fca9f9e3b5d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4431,6 +4431,10 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, ret = 1; goto out; } + if (!btrfs_grab_fs_root(subvol_root)) { + ret = 1; + goto out; + } switch (type) { case BTRFS_UUID_KEY_SUBVOL: @@ -4443,7 +4447,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, ret = 1; break; } - + btrfs_put_fs_root(subvol_root); out: return ret; } From 0d4b0463011de06288d8ca80a873a97a7d99a948 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:53 -0500 Subject: [PATCH 048/210] btrfs: export and rename free_fs_info We're going to start freeing roots and doing other complicated things in free_fs_info, so we need to move it to disk-io.c and export it in order to use things like btrfs_put_fs_root(). Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 17 ----------------- fs/btrfs/disk-io.c | 18 ++++++++++++++++++ fs/btrfs/disk-io.h | 1 + fs/btrfs/super.c | 6 +++--- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36dbf4e7c0dc..8c1a8f275773 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2695,23 +2695,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } -static inline void free_fs_info(struct btrfs_fs_info *fs_info) -{ - kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info->quota_root); - kfree(fs_info->uuid_root); - kfree(fs_info->free_space_root); - kfree(fs_info->super_copy); - kfree(fs_info->super_for_commit); - kvfree(fs_info); -} - /* tree mod log functions from ctree.c */ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 43af9f9b8583..ce36a4b9c8a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1523,6 +1523,24 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); +} + + struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, bool check_ref) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8add2e14aab1..97e7ac474a52 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, bool check_ref); +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void
btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2c459d29e099..a70c89b3a223 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1580,7 +1580,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, if (s->s_root) { btrfs_close_devices(fs_devices); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { @@ -1603,7 +1603,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error_close_devices: btrfs_close_devices(fs_devices); error_fs_info: - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); error_sec_opts: security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error); @@ -2179,7 +2179,7 @@ static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); } static struct file_system_type btrfs_fs_type = { From 4c78e9f59632b5e635e51db9b68111288b38ff22 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:54 -0500 Subject: [PATCH 049/210] btrfs: hold a ref on the root in open_ctree We lookup the fs_root and put it in our fs_info directly, we should hold a ref on this root for the lifetime of the fs_info. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ce36a4b9c8a3..76b888728540 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1537,6 +1537,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + btrfs_put_fs_root(fs_info->fs_root); kvfree(fs_info); } @@ -3206,6 +3207,13 @@ int __cold open_ctree(struct super_block *sb, goto fail_qgroup; } + if (!btrfs_grab_fs_root(fs_info->fs_root)) { + fs_info->fs_root = NULL; + err = -ENOENT; + btrfs_warn(fs_info, "failed to grab a ref on the fs tree"); + goto fail_qgroup; + } + if (sb_rdonly(sb)) return 0; From 81f096edf0472864879e1e5a7f0fdf87ea90fe75 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:55 -0500 Subject: [PATCH 050/210] btrfs: use btrfs_put_fs_root to free roots always If we are going to track leaked roots we need to free them all the same way, so don't kfree() roots directly, use btrfs_put_fs_root. 
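Freeing through the put path rather than kfree() matters once a leak checker is watching the refcount: the final reference, wherever it happens to be dropped, is what triggers the free. A user-space sketch of such a put (illustrative names, not btrfs code):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
            atomic_int refs;
            /* ... payload ... */
    };

    /* Every teardown path releases through here, so later bookkeeping
     * (such as the leak list added near the end of this series) has a
     * single choke point to hook. */
    static void obj_put(struct obj *o)
    {
            if (!o)
                    return;
            /* fetch_sub returns the old value: 1 means this was the last ref */
            if (atomic_fetch_sub(&o->refs, 1) == 1)
                    free(o);
    }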
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 30 +++++++++++++++--------------- fs/btrfs/free-space-tree.c | 2 +- fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/tree-log.c | 8 ++++---- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 76b888728540..5be61619c71b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1297,7 +1297,7 @@ fail: free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } - kfree(root); + btrfs_put_fs_root(root); return ERR_PTR(ret); } @@ -1328,7 +1328,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - kfree(root); + btrfs_put_fs_root(root); return ERR_CAST(leaf); } @@ -1431,7 +1431,7 @@ out: return root; find_fail: - kfree(root); + btrfs_put_fs_root(root); alloc_fail: root = ERR_PTR(ret); goto out; @@ -1527,17 +1527,17 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info->quota_root); - kfree(fs_info->uuid_root); - kfree(fs_info->free_space_root); + btrfs_put_fs_root(fs_info->extent_root); + btrfs_put_fs_root(fs_info->tree_root); + btrfs_put_fs_root(fs_info->chunk_root); + btrfs_put_fs_root(fs_info->dev_root); + btrfs_put_fs_root(fs_info->csum_root); + btrfs_put_fs_root(fs_info->quota_root); + btrfs_put_fs_root(fs_info->uuid_root); + btrfs_put_fs_root(fs_info->free_space_root); + btrfs_put_fs_root(fs_info->fs_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - btrfs_put_fs_root(fs_info->fs_root); kvfree(fs_info); } @@ -2223,12 +2223,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2237,7 +2237,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return ret; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 258cb3fae17a..c79804c30b17 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1253,7 +1253,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->node); free_extent_buffer(free_space_root->commit_root); - kfree(free_space_root); + btrfs_put_fs_root(free_space_root); return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff1870ff3474..09fc588ab4bf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1040,7 +1040,7 @@ out_free_root: if (ret) { free_extent_buffer(quota_root->node); free_extent_buffer(quota_root->commit_root); - kfree(quota_root); + btrfs_put_fs_root(quota_root); } out: if (ret) { @@ 
-1106,7 +1106,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) free_extent_buffer(quota_root->node); free_extent_buffer(quota_root->commit_root); - kfree(quota_root); + btrfs_put_fs_root(quota_root); end_trans: ret = btrfs_end_transaction(trans); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 84fb3fa940a6..5efd8000e65b 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -227,7 +227,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* One for allocate_extent_buffer */ free_extent_buffer(root->node); } - kfree(root); + btrfs_put_fs_root(root); } struct btrfs_block_group * diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 169270c931a9..1776ac466f02 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3289,7 +3289,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); - kfree(log); + btrfs_put_fs_root(log); } /* @@ -6187,7 +6187,7 @@ again: log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_fs_root(log); if (!ret) goto next; @@ -6226,7 +6226,7 @@ again: btrfs_put_fs_root(wc.replay_dest); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_fs_root(log); if (ret) goto error; @@ -6260,7 +6260,7 @@ next: free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - kfree(log_root_tree); + btrfs_put_fs_root(log_root_tree); return 0; error: From bc44d7c4b2b179c4b74fba208b9908e2ecbc1b4d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:56 -0500 Subject: [PATCH 051/210] btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root Now that all callers of btrfs_get_fs_root are subsequently calling btrfs_grab_fs_root and handling dropping the ref when they are done appropriately, go ahead and push btrfs_grab_fs_root up into btrfs_get_fs_root. 
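Pushing the grab into the getter also closes the window between lookup and grab: the reference is taken under the same lock that protects the lookup structure, and every caller receives an already-referenced object. A sketch with a plain mutex standing in for fs_roots_radix_lock (all names invented for illustration):

    #include <pthread.h>
    #include <stddef.h>

    struct obj;
    struct obj *obj_grab(struct obj *o);            /* NULL if refs hit zero */
    struct obj *table_find(unsigned long long id);  /* raw lookup, no ref taken */

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Callers get a referenced object (or NULL) and only ever need to put it. */
    struct obj *get_obj(unsigned long long id)
    {
            struct obj *o;

            pthread_mutex_lock(&table_lock);
            o = table_find(id);
            if (o)
                    o = obj_grab(o);        /* grab while the lock pins the entry */
            pthread_mutex_unlock(&table_lock);
            return o;
    }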
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ------ fs/btrfs/disk-io.c | 45 ++++++++++++++++++++++++------------------ fs/btrfs/export.c | 4 ---- fs/btrfs/file.c | 4 ---- fs/btrfs/inode.c | 4 ---- fs/btrfs/ioctl.c | 26 ------------------------ fs/btrfs/relocation.c | 8 +------- fs/btrfs/root-tree.c | 2 -- fs/btrfs/scrub.c | 4 ---- fs/btrfs/send.c | 11 ----------- fs/btrfs/super.c | 5 ----- fs/btrfs/transaction.c | 6 ------ fs/btrfs/tree-log.c | 4 ---- fs/btrfs/volumes.c | 4 ---- 14 files changed, 27 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 193747b6e1f9..b69154d72529 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -527,12 +527,6 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out_free; } - if (!btrfs_grab_fs_root(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - ret = -ENOENT; - goto out_free; - } - if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5be61619c71b..a0c67200fb2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1496,6 +1496,8 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); + if (root) + root = btrfs_grab_fs_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1552,29 +1554,31 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; + return btrfs_grab_fs_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; + return btrfs_grab_fs_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return fs_info->chunk_root; + return btrfs_grab_fs_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return fs_info->dev_root; + return btrfs_grab_fs_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return fs_info->csum_root; + return btrfs_grab_fs_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return fs_info->quota_root ? fs_info->quota_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return fs_info->uuid_root ? fs_info->uuid_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return fs_info->free_space_root ? fs_info->free_space_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (check_ref && btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_fs_root(root); return ERR_PTR(-ENOENT); + } return root; } @@ -1607,8 +1611,18 @@ again: if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + /* + * All roots have two refs on them at all times, one for the mounted fs, + * and one for being in the radix tree. This way we only free the root + * when we are unmounting or deleting the subvolume. 
We get one ref + * from __setup_root, one for inserting it into the radix tree, and then + * we have the third for returning it, and the caller will put it when + * it's done with the root. + */ + btrfs_grab_fs_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { + btrfs_put_fs_root(root); if (ret == -EEXIST) { btrfs_free_fs_root(root); goto again; @@ -3207,13 +3221,6 @@ int __cold open_ctree(struct super_block *sb, goto fail_qgroup; } - if (!btrfs_grab_fs_root(fs_info->fs_root)) { - fs_info->fs_root = NULL; - err = -ENOENT; - btrfs_warn(fs_info, "failed to grab a ref on the fs tree"); - goto fail_qgroup; - } - if (sb_rdonly(sb)) return 0; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index eba6c6d27bad..f07c2300ade2 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -82,10 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, err = PTR_ERR(root); goto fail; } - if (!btrfs_grab_fs_root(root)) { - err = -ENOENT; - goto fail; - } key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 27237bbaad65..ba19b4debc61 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -292,10 +292,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } - if (!btrfs_grab_fs_root(inode_root)) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; key.type = BTRFS_INODE_ITEM_KEY; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 324fc79174e6..ab67862bbfd6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5158,10 +5158,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = PTR_ERR(new_root); goto out; } - if (!btrfs_grab_fs_root(new_root)) { - err = -ENOENT; - goto out; - } *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f36aa0674ade..bb8106f7dd26 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -672,11 +672,6 @@ static noinline int create_subvol(struct inode *dir, btrfs_abort_transaction(trans, ret); goto fail; } - if (!btrfs_grab_fs_root(new_root)) { - ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - goto fail; - } btrfs_record_root_in_trans(trans, new_root); @@ -2191,10 +2186,6 @@ static noinline int search_ioctl(struct inode *inode, btrfs_free_path(path); return PTR_ERR(root); } - if (!btrfs_grab_fs_root(root)) { - btrfs_free_path(path); - return -ENOENT; - } } key.objectid = sk->min_objectid; @@ -2332,11 +2323,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, root = NULL; goto out; } - if (!btrfs_grab_fs_root(root)) { - ret = -ENOENT; - root = NULL; - goto out; - } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -2432,10 +2418,6 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(root); goto out; } - if (!btrfs_grab_fs_root(root)) { - ret = -ENOENT; - goto out; - } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -2684,10 +2666,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) ret = PTR_ERR(root); goto out_free; } - if (!btrfs_grab_fs_root(root)) { - ret = -ENOENT; - goto out; - } root_item = &root->root_item; subvol_info->treeid = key.objectid; @@ -4018,10 +3996,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!btrfs_grab_fs_root(new_root)) { - ret = -ENOENT; - goto out; - } if 
(!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; goto out_free; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 440b662c3b0b..360f347d7524 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -659,7 +659,6 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, u64 root_objectid) { struct btrfs_key key; - struct btrfs_root *root; key.objectid = root_objectid; key.type = BTRFS_ROOT_ITEM_KEY; @@ -668,12 +667,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, else key.offset = (u64)-1; - root = btrfs_get_fs_root(fs_info, &key, false); - if (IS_ERR(root)) - return root; - if (!btrfs_grab_fs_root(root)) - return ERR_PTR(-ENOENT); - return root; + return btrfs_get_fs_root(fs_info, &key, false); } static noinline_for_stack diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a32632f43987..2ab2a0239b7d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -257,8 +257,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(fs_info, &root_key, false); err = PTR_ERR_OR_ZERO(root); - if (!err && !btrfs_grab_fs_root(root)) - err = -ENOENT; if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 158a661ad2b0..8593885e2455 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -658,10 +658,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = PTR_ERR(local_root); goto err; } - if (!btrfs_grab_fs_root(local_root)) { - ret = -ENOENT; - goto err; - } /* * this makes the path point to (inum INODE_ITEM ioff) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 31495cdf1877..3e243ad4a95f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7201,11 +7201,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = PTR_ERR(clone_root); goto out; } - if (!btrfs_grab_fs_root(clone_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - ret = -ENOENT; - goto out; - } spin_lock(&clone_root->root_item_lock); if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { @@ -7247,12 +7242,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = PTR_ERR(sctx->parent_root); goto out; } - if (!btrfs_grab_fs_root(sctx->parent_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - ret = -ENOENT; - sctx->parent_root = ERR_PTR(ret); - goto out; - } spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a70c89b3a223..f1ad1189c930 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1102,11 +1102,6 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, fs_root = NULL; goto err; } - if (!btrfs_grab_fs_root(fs_root)) { - ret = -ENOENT; - fs_root = NULL; - goto err; - } /* * Walk up the filesystem tree by inode refs until we hit the diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3fa2e7d52eda..bcf23b06e67f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1639,12 +1639,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - if (!btrfs_grab_fs_root(pending->snap)) { - ret = -ENOENT; - pending->snap = NULL; - btrfs_abort_transaction(trans, ret); - goto fail; - } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 
1776ac466f02..54031a38906a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6163,10 +6163,6 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); - if (!IS_ERR(wc.replay_dest)) { - if (!btrfs_grab_fs_root(wc.replay_dest)) - wc.replay_dest = ERR_PTR(-ENOENT); - } if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3fca9f9e3b5d..089af576ad78 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4431,10 +4431,6 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, ret = 1; goto out; } - if (!btrfs_grab_fs_root(subvol_root)) { - ret = 1; - goto out; - } switch (type) { case BTRFS_UUID_KEY_SUBVOL: From 141386e1a5d641d6d89a7794f806a31945ec1e9f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:57 -0500 Subject: [PATCH 052/210] btrfs: free more things in btrfs_free_fs_info Things like the percpu_counters, the mapping_tree, and the csum hash can all be freed at btrfs_free_fs_info time, since the helpers all check if the structure has been initialized already. This significantly cleans up the error cases in open_ctree. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 60 +++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a0c67200fb2a..1f2661369738 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void) kmem_cache_destroy(btrfs_end_io_wq_cache); } +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + if (fs_info->csum_shash) + crypto_free_shash(fs_info->csum_shash); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. 
They checksum file and metadata bios @@ -1527,6 +1533,13 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); + btrfs_free_csum_hash(fs_info); + btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); btrfs_put_fs_root(fs_info->extent_root); @@ -2207,11 +2220,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) return 0; } -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - crypto_free_shash(fs_info->csum_shash); -} - static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2688,7 +2696,7 @@ int __cold open_ctree(struct super_block *sb, ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_dio_bytes; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2696,14 +2704,14 @@ int __cold open_ctree(struct super_block *sb, ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_dirty_metadata_bytes; + goto fail_srcu; } ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_delalloc_bytes; + goto fail_srcu; } INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); @@ -2770,7 +2778,7 @@ int __cold open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bio_counter; + goto fail_srcu; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2883,7 +2891,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_csum; + goto fail_alloc; } /* @@ -2920,11 +2928,11 @@ int __cold open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } if (!btrfs_super_root(disk_super)) - goto fail_csum; + goto fail_alloc; /* check FS state, whether FS is broken. 
*/ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2939,7 +2947,7 @@ int __cold open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -2949,7 +2957,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); @@ -2993,7 +3001,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_csum; + goto fail_alloc; } /* @@ -3009,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3346,25 +3354,14 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); -fail_csum: - btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_bio_counter: - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -fail_delalloc_bytes: - percpu_counter_destroy(&fs_info->delalloc_bytes); -fail_dirty_metadata_bytes: - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_dio_bytes: - percpu_counter_destroy(&fs_info->dio_bytes); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; } @@ -4071,16 +4068,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); - percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); - - btrfs_free_csum_hash(fs_info); - btrfs_free_stripe_hash_table(fs_info); - btrfs_free_ref_cache(fs_info); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, From ae18c37ad5a12a73d119dc1d7b30199574a41bf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:58 -0500 Subject: [PATCH 053/210] btrfs: move fs_info init work into its own helper function open_ctree mixes initialization of fs stuff and fs_info stuff, which makes it confusing when doing things like adding the root leak detection. Make a separate function that inits all the static structures inside of the fs_info needed for the fs to operate, and then call that before we start setting up the fs_info to be mounted.
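The refactor follows a common C pattern: collect the fallible init of embedded members into one helper with a single unwind label, so the caller only has to undo whole stages. A generic sketch of the pattern (not the btrfs code itself; all names invented):

    struct ctx {
            /* embedded counters, locks, lists ... */
            int stage;
    };

    int  init_counters(struct ctx *c);      /* hypothetical fallible steps */
    int  init_tables(struct ctx *c);
    void destroy_counters(struct ctx *c);

    int ctx_init(struct ctx *c)
    {
            int ret;

            ret = init_counters(c);
            if (ret)
                    return ret;             /* nothing to unwind yet */

            ret = init_tables(c);
            if (ret)
                    goto fail;              /* undo only what succeeded */

            return 0;
    fail:
            destroy_counters(c);
            return ret;
    }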
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 165 +++++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1f2661369738..0bc6bc2c10f0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2649,70 +2649,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) { - u32 sectorsize; - u32 nodesize; - u32 stripesize; - u64 generation; - u64 features; - u16 csum_type; - struct btrfs_key location; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; int ret; - int err = -EINVAL; - int clear_free_space_tree = 0; - int level; - - tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, - GFP_KERNEL); - fs_info->tree_root = tree_root; - chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, - GFP_KERNEL); - fs_info->chunk_root = chunk_root; - if (!tree_root || !chunk_root) { - err = -ENOMEM; - goto fail; - } - - ret = init_srcu_struct(&fs_info->subvol_srcu); - if (ret) { - err = ret; - goto fail; - } - - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - fs_info->dirty_metadata_batch = PAGE_SIZE * - (1 + ilog2(nr_cpu_ids)); - - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, - GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); @@ -2775,21 +2714,6 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_srcu; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) { - err = -ENOMEM; - goto fail_iput; - } - btrfs_init_delayed_root(fs_info->delayed_root); - btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2800,8 +2724,6 @@ int __cold open_ctree(struct super_block *sb, sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - btrfs_init_btree_inode(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; @@ -2847,12 +2769,94 @@ int __cold open_ctree(struct super_block *sb, fs_info->send_in_progress = 0; + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + fs_info->dirty_metadata_batch = PAGE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + 
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); + if (ret) + goto fail; + + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_KERNEL); + if (!fs_info->delayed_root) { + ret = -ENOMEM; + goto fail; + } + btrfs_init_delayed_root(fs_info->delayed_root); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) + goto fail; + + return 0; +fail: + cleanup_srcu_struct(&fs_info->subvol_srcu); + return ret; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + u16 csum_type; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int err = -EINVAL; + int clear_free_space_tree = 0; + int level; + + ret = init_fs_info(fs_info, sb); if (ret) { err = ret; - goto fail_alloc; + goto fail; } + /* These need to be init'ed before we start creating inodes and such. */ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { + err = -ENOMEM; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_srcu; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + btrfs_init_btree_inode(fs_info); + invalidate_bdev(fs_devices->latest_bdev); /* @@ -3355,7 +3359,6 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: -fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); From 8260edba67a2e6bd5e709d32188e23aa22cb4a38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:59 -0500 Subject: [PATCH 054/210] btrfs: make the init of static elements in fs_info separate In adding things like eb leak checking and root leak checking there were a lot of weird corner cases that come from the fact that 1) We do not init the fs_info until we get to open_ctree time in the normal case and 2) The test infrastructure half-inits the fs_info for things that it needs. This makes it really annoying to make changes because you have to add init in two different places, have special cases for testing fs_info's that may not have certain things initialized, and cases for fs_info's that didn't make it to open_ctree and thus are not fully set up. Fix this by extracting out the non-allocating init of the fs_info into its own public function and using that to make sure we're all getting consistent views of an allocated fs_info.
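The key property of the extracted helper is that it allocates nothing: it only initializes members embedded in an already-zeroed structure, so the mount path and a test harness can call the very same function. A sketch of that split, with illustrative names (not btrfs code):

    #include <pthread.h>
    #include <stdlib.h>

    struct fs {
            pthread_mutex_t lock;   /* embedded: safe to init without I/O */
            struct fs *dirty_next;
    };

    /* Non-allocating init: valid on any zeroed struct fs, cannot fail. */
    void fs_init_static(struct fs *fs)
    {
            pthread_mutex_init(&fs->lock, NULL);
            fs->dirty_next = NULL;
    }

    /* mount path */
    struct fs *fs_alloc(void)
    {
            struct fs *fs = calloc(1, sizeof(*fs));

            if (fs)
                    fs_init_static(fs);     /* same init the tests use */
            return fs;
    }

    /* test path: struct fs fs = {0}; fs_init_static(&fs); ... */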
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 19 +++++++++++-------- fs/btrfs/disk-io.h | 1 + fs/btrfs/super.c | 7 +++++-- fs/btrfs/tests/btrfs-tests.c | 27 ++++----------------------- 4 files changed, 21 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0bc6bc2c10f0..3e4df0c6a663 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2649,10 +2649,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - int ret; - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2695,7 +2693,6 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2721,9 +2718,6 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; - sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; @@ -2768,6 +2762,15 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + int ret; + + fs_info->sb = sb; + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) @@ -2831,7 +2834,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device int clear_free_space_tree = 0; int level; - ret = init_fs_info(fs_info, sb); + ret = init_mount_fs_info(fs_info, sb); if (ret) { err = ret; goto fail; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 97e7ac474a52..2414d572bc9a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,7 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1ad1189c930..d6629e693a96 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1522,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need - * it for searching for existing supers, so this lets us do that and - * then open_ctree will properly initialize everything later. + * then open_ctree will properly initialize the file system specific + * settings later. 
btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. */ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; } + btrfs_init_fs_info(fs_info); fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 5efd8000e65b..683381a692bc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) kfree(fs_info); return NULL; } + INIT_LIST_HEAD(&fs_info->fs_devices->devices); + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), GFP_KERNEL); if (!fs_info->super_copy) { @@ -128,6 +130,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) return NULL; } + btrfs_init_fs_info(fs_info); + fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; @@ -138,29 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) return NULL; } - spin_lock_init(&fs_info->buffer_lock); - spin_lock_init(&fs_info->qgroup_lock); - spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - mutex_init(&fs_info->qgroup_rescan_lock); - rwlock_init(&fs_info->tree_mod_log_lock); - fs_info->running_transaction = NULL; - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_ulist = NULL; - atomic64_set(&fs_info->tree_mod_seq, 0); - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - INIT_LIST_HEAD(&fs_info->dead_roots); - INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); - INIT_LIST_HEAD(&fs_info->fs_devices->devices); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - extent_map_tree_init(&fs_info->mapping_tree); - fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; From bd647ce385ec110fe7796267b6555873e48e44eb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:33:00 -0500 Subject: [PATCH 055/210] btrfs: add a leak check for roots Now that we're going to start relying on getting ref counting right for roots, add a list to track allocated roots and print out any roots that aren't freed up at free_fs_info time. Hide this behind CONFIG_BTRFS_DEBUG because this will just be used for developers to verify they aren't breaking things. 
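The leak check amounts to a debug-only registry: every allocated object is linked into a list, unlinked again when it is freed, and whatever is still on the list at teardown gets reported. A single-threaded user-space sketch of the idea (the patch itself serializes the list updates with fs_roots_radix_lock; all names here are invented):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            struct node *next;
            long id;
    };

    #ifdef DEBUG
    static struct node *allocated;          /* all live nodes, newest first */
    #endif

    static struct node *node_alloc(long id)
    {
            struct node *n = calloc(1, sizeof(*n));

            if (!n)
                    return NULL;
            n->id = id;
    #ifdef DEBUG
            n->next = allocated;            /* register while it lives */
            allocated = n;
    #endif
            return n;
    }

    static void node_free(struct node *n)
    {
    #ifdef DEBUG
            struct node **p;

            for (p = &allocated; *p; p = &(*p)->next) {
                    if (*p == n) {
                            *p = n->next;   /* unlink from the registry */
                            break;
                    }
            }
    #endif
            free(n);
    }

    #ifdef DEBUG
    /* called once at teardown: anything still registered was leaked */
    static void check_leaks(void)
    {
            struct node *n;

            for (n = allocated; n; n = n->next)
                    fprintf(stderr, "leaked node %ld\n", n->id);
    }
    #endif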
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/disk-io.h | 9 ++++++++- fs/btrfs/tests/btrfs-tests.c | 1 + 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8c1a8f275773..ffd99c3f64db 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -947,6 +947,7 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct kobject *discard_debug_kobj; + struct list_head allocated_roots; #endif }; @@ -1149,6 +1150,10 @@ struct btrfs_root { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; struct btrfs_clone_extent_info { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e4df0c6a663..c2838a48327f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1202,6 +1202,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&root->leak_list); + spin_lock(&fs_info->fs_roots_radix_lock); + list_add_tail(&root->leak_list, &fs_info->allocated_roots); + spin_unlock(&fs_info->fs_roots_radix_lock); +#endif } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, @@ -1531,6 +1537,24 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_root *root; + + while (!list_empty(&fs_info->allocated_roots)) { + root = list_first_entry(&fs_info->allocated_roots, + struct btrfs_root, leak_list); + btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", + root->root_key.objectid, root->root_key.offset, + refcount_read(&root->refs)); + while (refcount_read(&root->refs) > 1) + btrfs_put_fs_root(root); + btrfs_put_fs_root(root); + } +#endif +} + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); @@ -1551,6 +1575,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_fs_root(fs_info->uuid_root); btrfs_put_fs_root(fs_info->free_space_root); btrfs_put_fs_root(fs_info->fs_root); + btrfs_check_leaked_roots(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); @@ -2677,6 +2702,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&fs_info->allocated_roots); +#endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2414d572bc9a..04a29f961527 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,7 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); @@ -101,8 +102,14 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) { if (!root) return; - if (refcount_dec_and_test(&root->refs)) + if 
(refcount_dec_and_test(&root->refs)) { +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif kfree(root); + } } void btrfs_mark_buffer_dirty(struct extent_buffer *buf); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 683381a692bc..609abca4fe3a 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -193,6 +193,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_fs_roots(fs_info); cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->super_copy); + btrfs_check_leaked_roots(fs_info); kfree(fs_info->fs_devices); kfree(fs_info); } From 0024652895e3479cd0d372f63b57d9581a0bdd38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:33:01 -0500 Subject: [PATCH 056/210] btrfs: rename btrfs_put_fs_root and btrfs_grab_fs_root We are now using these for all roots, rename them to btrfs_put_root() and btrfs_grab_root(); Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- fs/btrfs/disk-io.c | 78 ++++++++++++++++++------------------ fs/btrfs/disk-io.h | 4 +- fs/btrfs/export.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/free-space-tree.c | 2 +- fs/btrfs/inode.c | 6 +-- fs/btrfs/ioctl.c | 18 ++++----- fs/btrfs/ordered-data.c | 4 +- fs/btrfs/qgroup.c | 4 +- fs/btrfs/relocation.c | 50 +++++++++++------------ fs/btrfs/root-tree.c | 2 +- fs/btrfs/scrub.c | 6 +-- fs/btrfs/send.c | 12 +++--- fs/btrfs/super.c | 4 +- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/tree-log.c | 10 ++--- fs/btrfs/volumes.c | 2 +- 19 files changed, 106 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b69154d72529..ded46efac27d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -577,7 +577,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, ret = add_all_parents(root, path, parents, ref, level, time_seq, extent_item_pos, total_refs, ignore_offset); out: - btrfs_put_fs_root(root); + btrfs_put_root(root); out_free: path->lowest_level = 0; btrfs_release_path(path); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c2838a48327f..f859a28255dd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1309,7 +1309,7 @@ fail: free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1340,7 +1340,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_CAST(leaf); } @@ -1443,7 +1443,7 @@ out: return root; find_fail: - btrfs_put_fs_root(root); + btrfs_put_root(root); alloc_fail: root = ERR_PTR(ret); goto out; @@ -1509,7 +1509,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); if (root) - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1528,7 +1528,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) { - btrfs_grab_fs_root(root); + btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } 
spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1549,8 +1549,8 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) root->root_key.objectid, root->root_key.offset, refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) - btrfs_put_fs_root(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); + btrfs_put_root(root); } #endif } @@ -1566,15 +1566,15 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - btrfs_put_fs_root(fs_info->extent_root); - btrfs_put_fs_root(fs_info->tree_root); - btrfs_put_fs_root(fs_info->chunk_root); - btrfs_put_fs_root(fs_info->dev_root); - btrfs_put_fs_root(fs_info->csum_root); - btrfs_put_fs_root(fs_info->quota_root); - btrfs_put_fs_root(fs_info->uuid_root); - btrfs_put_fs_root(fs_info->free_space_root); - btrfs_put_fs_root(fs_info->fs_root); + btrfs_put_root(fs_info->extent_root); + btrfs_put_root(fs_info->tree_root); + btrfs_put_root(fs_info->chunk_root); + btrfs_put_root(fs_info->dev_root); + btrfs_put_root(fs_info->csum_root); + btrfs_put_root(fs_info->quota_root); + btrfs_put_root(fs_info->uuid_root); + btrfs_put_root(fs_info->free_space_root); + btrfs_put_root(fs_info->fs_root); btrfs_check_leaked_roots(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); @@ -1592,29 +1592,29 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->tree_root); + return btrfs_grab_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->extent_root); + return btrfs_grab_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->chunk_root); + return btrfs_grab_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->dev_root); + return btrfs_grab_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->csum_root); + return btrfs_grab_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->quota_root) ? + return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->uuid_root) ? + return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->free_space_root) ? + return btrfs_grab_root(fs_info->free_space_root) ? fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { if (check_ref && btrfs_root_refs(&root->root_item) == 0) { - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_PTR(-ENOENT); } return root; @@ -1657,10 +1657,10 @@ again: * we have the third for returning it, and the caller will put it when * it's done with the root. 
*/ - btrfs_grab_fs_root(root); + btrfs_grab_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret == -EEXIST) { btrfs_free_fs_root(root); goto again; @@ -2062,7 +2062,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]->node); free_extent_buffer(gang[0]->commit_root); - btrfs_put_fs_root(gang[0]); + btrfs_put_root(gang[0]); } } @@ -2270,12 +2270,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2284,7 +2284,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } @@ -3884,7 +3884,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) @@ -3895,7 +3895,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (root->reloc_root) { free_extent_buffer(root->reloc_root->node); free_extent_buffer(root->reloc_root->commit_root); - btrfs_put_fs_root(root->reloc_root); + btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } } @@ -3919,7 +3919,7 @@ void btrfs_free_fs_root(struct btrfs_root *root) free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); - btrfs_put_fs_root(root); + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3949,7 +3949,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) continue; } /* grab all the search result for later use */ - gang[i] = btrfs_grab_fs_root(gang[i]); + gang[i] = btrfs_grab_root(gang[i]); } srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -3960,7 +3960,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) err = btrfs_orphan_cleanup(gang[i]); if (err) break; - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } root_objectid++; } @@ -3968,7 +3968,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } return err; } @@ -4368,12 +4368,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->delalloc_root_lock); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 04a29f961527..db21ab614357 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ 
-89,7 +89,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); * If you want to ensure the whole tree is safe, you should use * fs_info->subvol_srcu */ -static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { if (!root) return NULL; @@ -98,7 +98,7 @@ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) return NULL; } -static inline void btrfs_put_fs_root(struct btrfs_root *root) +static inline void btrfs_put_root(struct btrfs_root *root) { if (!root) return; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index f07c2300ade2..657fd6ad6e18 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -88,7 +88,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.offset = 0; inode = btrfs_iget(sb, &key, root); - btrfs_put_fs_root(root); + btrfs_put_root(root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7bc66121330..2b4c3ca5e651 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5418,7 +5418,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - btrfs_put_fs_root(root); + btrfs_put_root(root); } root_dropped = true; out_end_trans: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ba19b4debc61..3f7f5323aefa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -297,7 +297,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root); - btrfs_put_fs_root(inode_root); + btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c79804c30b17..bc43950eb32f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1253,7 +1253,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->node); free_extent_buffer(free_space_root->commit_root); - btrfs_put_fs_root(free_space_root); + btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ab67862bbfd6..945f8e2043e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5401,7 +5401,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) inode = btrfs_iget(dir->i_sb, &location, sub_root); } if (root != sub_root) - btrfs_put_fs_root(sub_root); + btrfs_put_root(sub_root); srcu_read_unlock(&fs_info->subvol_srcu, index); if (!IS_ERR(inode) && root != sub_root) { @@ -9729,14 +9729,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, nr, false); - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret < 0) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bb8106f7dd26..7b0c79eac3d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -676,7 +676,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); ret = 
btrfs_create_subvol_root(trans, new_root, root, new_dirid); - btrfs_put_fs_root(new_root); + btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, ret); @@ -870,7 +870,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: - btrfs_put_fs_root(pending_snapshot->snap); + btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (snapshot_force_cow) @@ -2176,7 +2176,7 @@ static noinline int search_ioctl(struct inode *inode, if (sk->tree_id == 0) { /* search the root of the inode that was passed */ - root = btrfs_grab_fs_root(BTRFS_I(inode)->root); + root = btrfs_grab_root(BTRFS_I(inode)->root); } else { key.objectid = sk->tree_id; key.type = BTRFS_ROOT_ITEM_KEY; @@ -2210,7 +2210,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; err: sk->nr_items = num_found; - btrfs_put_fs_root(root); + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2371,7 +2371,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, name[total_len] = '\0'; ret = 0; out: - btrfs_put_fs_root(root); + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2499,7 +2499,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; - btrfs_put_fs_root(root); + btrfs_put_root(root); root = NULL; btrfs_release_path(path); } @@ -2536,7 +2536,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, args->name[item_len] = 0; out_put: - btrfs_put_fs_root(root); + btrfs_put_root(root); out: btrfs_free_path(path); return ret; @@ -2739,7 +2739,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) ret = -EFAULT; out: - btrfs_put_fs_root(root); + btrfs_put_root(root); out_free: btrfs_free_path(path); kzfree(subvol_info); @@ -4034,7 +4034,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); out_free: - btrfs_put_fs_root(new_root); + btrfs_put_root(new_root); btrfs_free_path(path); out: mnt_drop_write_file(file); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 66170c8cea6f..632981848649 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->ordered_root, &fs_info->ordered_roots); @@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->ordered_root_lock); if (nr != U64_MAX) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 09fc588ab4bf..7e02cd61da73 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1040,7 +1040,7 @@ out_free_root: if (ret) { free_extent_buffer(quota_root->node); free_extent_buffer(quota_root->commit_root); - btrfs_put_fs_root(quota_root); + btrfs_put_root(quota_root); } out: if (ret) { @@ -1106,7 +1106,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) free_extent_buffer(quota_root->node); 
free_extent_buffer(quota_root->commit_root); - btrfs_put_fs_root(quota_root); + btrfs_put_root(quota_root); end_trans: ret = btrfs_end_transaction(trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 360f347d7524..034f5f151a74 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -303,7 +303,7 @@ static void free_backref_node(struct backref_cache *cache, { if (node) { cache->nr_nodes--; - btrfs_put_fs_root(node->root); + btrfs_put_root(node->root); kfree(node); } } @@ -637,7 +637,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, root = (struct btrfs_root *)node->data; } spin_unlock(&rc->reloc_root_tree.lock); - return btrfs_grab_fs_root(root); + return btrfs_grab_root(root); } static int is_cowonly_root(u64 root_objectid) @@ -948,7 +948,7 @@ again: ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); if (should_ignore_root(root)) { - btrfs_put_fs_root(root); + btrfs_put_root(root); list_add(&cur->list, &useless); } else { cur->root = root; @@ -965,7 +965,7 @@ again: ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); path2->lowest_level = 0; if (ret < 0) { - btrfs_put_fs_root(root); + btrfs_put_root(root); err = ret; goto out; } @@ -981,7 +981,7 @@ again: root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); - btrfs_put_fs_root(root); + btrfs_put_root(root); err = -ENOENT; goto out; } @@ -994,7 +994,7 @@ again: ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); if (should_ignore_root(root)) { - btrfs_put_fs_root(root); + btrfs_put_root(root); list_add(&lower->list, &useless); } else { lower->root = root; @@ -1004,7 +1004,7 @@ again: edge = alloc_backref_edge(cache); if (!edge) { - btrfs_put_fs_root(root); + btrfs_put_root(root); err = -ENOMEM; goto out; } @@ -1014,7 +1014,7 @@ again: if (!rb_node) { upper = alloc_backref_node(cache); if (!upper) { - btrfs_put_fs_root(root); + btrfs_put_root(root); free_backref_edge(cache, edge); err = -ENOMEM; goto out; @@ -1063,7 +1063,7 @@ again: edge->node[UPPER] = upper; if (rb_node) { - btrfs_put_fs_root(root); + btrfs_put_root(root); break; } lower = upper; @@ -1302,7 +1302,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->level = node->level; new_node->lowest = node->lowest; new_node->checked = 1; - new_node->root = btrfs_grab_fs_root(dest); + new_node->root = btrfs_grab_root(dest); ASSERT(new_node->root); if (!node->lowest) { @@ -2271,7 +2271,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (list_empty(&root->reloc_dirty_list)) { - btrfs_grab_fs_root(root); + btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } } @@ -2303,7 +2303,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); - btrfs_put_fs_root(root); + btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); @@ -2528,7 +2528,7 @@ again: btrfs_update_reloc_root(trans, root); list_add(&reloc_root->root_list, &reloc_roots); - btrfs_put_fs_root(root); + btrfs_put_root(root); } list_splice(&reloc_roots, &rc->reloc_roots); @@ -2590,7 +2590,7 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, @@ -2651,7 +2651,7 @@ static int record_reloc_root_in_trans(struct 
btrfs_trans_handle *trans, BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); ret = btrfs_record_root_in_trans(trans, root); - btrfs_put_fs_root(root); + btrfs_put_root(root); return ret; } @@ -2686,8 +2686,8 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(next->new_bytenr); BUG_ON(!list_empty(&next->list)); next->new_bytenr = root->node->start; - btrfs_put_fs_root(next->root); - next->root = btrfs_grab_fs_root(root); + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); @@ -3160,8 +3160,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_record_root_in_trans(trans, root); root = root->reloc_root; node->new_bytenr = root->node->start; - btrfs_put_fs_root(node->root); - node->root = btrfs_grab_fs_root(root); + btrfs_put_root(node->root); + node->root = btrfs_grab_root(root); ASSERT(node->root); list_add_tail(&node->list, &rc->backref_cache.changed); } else { @@ -3857,7 +3857,7 @@ next: } out: - btrfs_put_fs_root(root); + btrfs_put_root(root); out_free: btrfs_free_path(path); return err; @@ -4343,7 +4343,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 6); if (IS_ERR(trans)) { - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_CAST(trans); } @@ -4363,7 +4363,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: - btrfs_put_fs_root(root); + btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { @@ -4635,7 +4635,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } } else { - btrfs_put_fs_root(fs_root); + btrfs_put_root(fs_root); } } @@ -4689,7 +4689,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; - btrfs_put_fs_root(fs_root); + btrfs_put_root(fs_root); } err = btrfs_commit_transaction(trans); @@ -4725,7 +4725,7 @@ out: err = PTR_ERR(fs_root); } else { err = btrfs_orphan_cleanup(fs_root); - btrfs_put_fs_root(fs_root); + btrfs_put_root(fs_root); } } return err; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 2ab2a0239b7d..b37904327c9e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -287,7 +287,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } - btrfs_put_fs_root(root); + btrfs_put_root(root); } btrfs_free_path(path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8593885e2455..16aa680134fd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -668,7 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { - btrfs_put_fs_root(local_root); + btrfs_put_root(local_root); btrfs_release_path(swarn->path); goto err; } @@ -689,7 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ipath = init_ipath(4096, local_root, swarn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { - btrfs_put_fs_root(local_root); + btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; goto err; @@ -713,7 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); - 
btrfs_put_fs_root(local_root); + btrfs_put_root(local_root); free_ipath(ipath); return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3e243ad4a95f..6b86841315be 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7205,7 +7205,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); - btrfs_put_fs_root(clone_root); + btrfs_put_root(clone_root); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; @@ -7213,7 +7213,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (clone_root->dedupe_in_progress) { dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); - btrfs_put_fs_root(clone_root); + btrfs_put_root(clone_root); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; @@ -7270,7 +7270,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * for possible clone sources. */ sctx->clone_roots[sctx->clone_roots_cnt++].root = - btrfs_grab_fs_root(sctx->send_root); + btrfs_grab_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -7358,20 +7358,20 @@ out: for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); - btrfs_put_fs_root(sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); } } else { for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); - btrfs_put_fs_root(sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); } btrfs_root_dec_send_in_progress(send_root); } if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); - btrfs_put_fs_root(sctx->parent_root); + btrfs_put_root(sctx->parent_root); } kvfree(clone_sources_tmp); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d6629e693a96..6420df4e86d7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1144,7 +1144,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ptr[0] = '/'; btrfs_release_path(path); } - btrfs_put_fs_root(fs_root); + btrfs_put_root(fs_root); fs_root = NULL; } @@ -1158,7 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, return name; err: - btrfs_put_fs_root(fs_root); + btrfs_put_root(fs_root); btrfs_free_path(path); kfree(name); return ERR_PTR(ret); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 609abca4fe3a..69c9afef06e3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -209,7 +209,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* One for allocate_extent_buffer */ free_extent_buffer(root->node); } - btrfs_put_fs_root(root); + btrfs_put_root(root); } struct btrfs_block_group * diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 54031a38906a..4ea47bac5fcf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3289,7 +3289,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); - btrfs_put_fs_root(log); + btrfs_put_root(log); } /* @@ -6183,7 +6183,7 @@ again: log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); - btrfs_put_fs_root(log); + btrfs_put_root(log); if (!ret) goto next; @@ 
-6219,10 +6219,10 @@ again: } wc.replay_dest->log_root = NULL; - btrfs_put_fs_root(wc.replay_dest); + btrfs_put_root(wc.replay_dest); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); - btrfs_put_fs_root(log); + btrfs_put_root(log); if (ret) goto error; @@ -6256,7 +6256,7 @@ next: free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - btrfs_put_fs_root(log_root_tree); + btrfs_put_root(log_root_tree); return 0; error: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 089af576ad78..b092021e41e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4443,7 +4443,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, ret = 1; break; } - btrfs_put_fs_root(subvol_root); + btrfs_put_root(subvol_root); out: return ret; } From b908c334e7a419e5cd08a45d31284b4a93de3bd7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:26:51 +0100 Subject: [PATCH 057/210] btrfs: move root node locking helpers to locking.c The helpers are related to locking so move them there, update comments. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 38 -------------------------------------- fs/btrfs/locking.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 968faaec0e39..b62721ac5ee8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -143,44 +143,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_lock(eb); - if (eb == root->node) - break; - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); - if (eb == root->node) - break; - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 571c4826c428..e713900f96b6 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -523,3 +523,45 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level) path->locks[i] = 0; } } + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. 
+ * + * Return: root extent buffer with write lock held + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with read lock held + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} From bf31f87f71cc7a89871ab0a451c047a0c0144bf1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:34:34 +0100 Subject: [PATCH 058/210] btrfs: add wrapper for transaction abort predicate The status of aborted transaction can change between calls and it needs to be accessed by READ_ONCE. Add a helper that also wraps the unlikely hint. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/extent-tree.c | 10 +++++----- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 25 +++++++++++++------------ fs/btrfs/transaction.h | 12 ++++++++++++ 6 files changed, 33 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7f09147872dc..c12d6399c6d7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2345,7 +2345,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, return 0; } - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; again: inode = lookup_free_space_inode(block_group, path); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9b23883a1086..ee70584ddc45 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1139,7 +1139,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) int ret = 0; bool count = (nr > 0); - if (trans->aborted) + if (TRANS_ABORTED(trans)) return -EIO; path = btrfs_alloc_path(); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2b4c3ca5e651..da3d9eaf3b22 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1583,7 +1583,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int err = 0; int metadata = !extent_op->is_data; - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1703,7 +1703,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) { + if (TRANS_ABORTED(trans)) { if (insert_reserved) btrfs_pin_extent(trans->fs_info, node->bytenr, node->num_bytes, 1); @@ -2191,7 +2191,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2913,7 +2913,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) else unpin = &fs_info->freed_extents[0]; - while (!trans->aborted) { + while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2950,7 +2950,7 @@ int 
btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 trimmed = 0; ret = -EROFS; - if (!trans->aborted) + if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6420df4e86d7..5c16b4bcde9b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; - trans->aborted = errno; + WRITE_ONCE(trans->aborted, errno); /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->dirty && list_empty(&trans->new_bgs)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bcf23b06e67f..37680351b7c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -243,7 +243,7 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { - if (cur_trans->aborted) { + if (TRANS_ABORTED(cur_trans)) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } @@ -459,7 +459,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans) { return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && - !trans->aborted); + !TRANS_ABORTED(trans)); } /* wait for commit against the current transaction to become unblocked @@ -478,7 +478,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || - cur_trans->aborted); + TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } else { spin_unlock(&fs_info->trans_lock); @@ -937,7 +937,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (trans->aborted || + if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); err = -EIO; @@ -1794,7 +1794,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, struct btrfs_transaction *trans) { wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); + trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(trans)); } /* @@ -1806,7 +1807,8 @@ static void wait_current_trans_commit_start_and_unblock( struct btrfs_transaction *trans) { wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); + trans->state >= TRANS_STATE_UNBLOCKED || + TRANS_ABORTED(trans)); } /* @@ -2026,7 +2028,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trans->dirty = true; /* Stop the commit early if ->aborted is set */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; btrfs_end_transaction(trans); return ret; @@ -2100,7 +2102,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_for_commit(cur_trans); - if (unlikely(cur_trans->aborted)) + if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); @@ -2119,7 +2121,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); - ret = prev_trans->aborted; + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); if (ret) @@ -2173,8 +2175,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - /* 
->aborted might be set after the previous check, so check it */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; } @@ -2292,7 +2293,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 49f7196368f5..453cea7c7a72 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -115,6 +115,10 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; refcount_t use_count; unsigned int type; + /* + * Error code of transaction abort, set outside of locks and must use + * the READ_ONCE/WRITE_ONCE access + */ short aborted; bool adding_csums; bool allocating_chunk; @@ -126,6 +130,14 @@ struct btrfs_trans_handle { struct list_head new_bgs; }; +/* + * The abort status can be changed between calls and is not protected by locks. + * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's + * set to a non-zero value it does not change, so the macro should be in checks + * but is not necessary for further reads of the value. + */ +#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) + struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; From 45b08405b9f0dc31164b6c28ddcd6aeda3be54be Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:26 +0100 Subject: [PATCH 059/210] btrfs: remove extent_page_data::tree All functions that set up extent_page_data::tree set it to the inode io_tree. That's passed down the callstack that accesses either the same inode or its pages. In the end submit_extent_page can pull the tree out of the page and we don't have to store it in the structure. 
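For illustration, the pointer the structure cached can always be rederived from the page being processed, which is what submit_extent_page does instead of using the copy stored in extent_page_data (a minimal sketch; the same expression appears in the hunks below):

	/* any page of an inode leads back to that inode's io_tree */
	struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;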
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cd07a9c30ec3..3047ab59f19c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -122,7 +122,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; - struct extent_io_tree *tree; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -3004,6 +3003,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, sector_t sector = offset >> 9; ASSERT(bio_ret); + ASSERT(tree == &BTRFS_I(page->mapping->host)->io_tree); if (*bio_ret) { bool contig; @@ -3471,7 +3471,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, unsigned long nr_written, int *nr_ret) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3945,11 +3945,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4238,7 +4236,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) int ret; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4260,14 +4257,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, { int ret = 0; struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4311,7 +4306,6 @@ int extent_writepages(struct address_space *mapping, int ret = 0; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; From 0ceb34bf46d1a849b4d0659f8913bfcb14e5d482 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:28 +0100 Subject: [PATCH 060/210] btrfs: drop argument tree from submit_extent_page Now that we're sure the tree from argument is same as the one we can get from the page's inode io_tree, drop the redundant argument. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3047ab59f19c..399e9fc2545e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2973,7 +2973,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) /* * @opf: bio REQ_OP_* and REQ_* flags as one value - * @tree: tree so we can call our merge_bio hook * @wbc: optional writeback control for io accounting * @page: page to add to the bio * @pg_offset: offset of the new bio or to check whether we are adding @@ -2986,7 +2985,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them */ -static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, +static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, @@ -3001,9 +3000,9 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct bio *bio; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); - ASSERT(tree == &BTRFS_I(page->mapping->host)->io_tree); if (*bio_ret) { bool contig; @@ -3290,7 +3289,7 @@ static int __do_readpage(struct extent_io_tree *tree, continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, + ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, end_bio_extent_readpage, mirror_num, @@ -3557,7 +3556,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, offset, iosize, pg_offset, &epd->bio, end_bio_extent_writepage, @@ -3878,8 +3877,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; int i, num_pages; @@ -3912,7 +3909,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, From ae6957ebbfcd418348550ac02e36b0ea86d32e0a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:30 +0100 Subject: [PATCH 061/210] btrfs: add assertions for tree == inode->io_tree to extent IO helpers Add assertions to all helpers that get tree as argument and verify that it's the same that can be obtained from the inode or from its pages. In followup patches the redundant arguments and assertions will be removed one by one. 
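The pattern, sketched here with the page-based variant (the concrete assertions are in the hunks below), is to keep the argument for now but assert that it matches what the inode already provides, so that removing the argument later is provably safe:

	/* the tree argument is redundant if this never triggers */
	ASSERT(tree == &BTRFS_I(page->mapping->host)->io_tree);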
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 ++++++++++ fs/btrfs/ordered-data.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 399e9fc2545e..7a2f657f5dbd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3134,6 +3134,8 @@ static int __do_readpage(struct extent_io_tree *tree, size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; + ASSERT(tree == &BTRFS_I(inode)->io_tree); + set_page_extent_mapped(page); if (!PageUptodate(page)) { @@ -3327,6 +3329,8 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; + ASSERT(tree == &inode->io_tree); + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { @@ -3348,6 +3352,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret; + ASSERT(tree == &inode->io_tree); + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, @@ -3362,6 +3368,8 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, unsigned long bio_flags = 0; int ret; + ASSERT(tree == &BTRFS_I(page->mapping->host)->io_tree); + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, 0); if (bio) @@ -5449,6 +5457,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + ASSERT(tree == &BTRFS_I(eb->pages[0]->mapping->host)->io_tree); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 632981848649..f47accad6f05 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -854,6 +854,8 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, struct extent_state *cache = NULL; struct extent_state **cachedp = &cache; + ASSERT(tree == &inode->io_tree); + if (cached_state) cachedp = cached_state; From b272ae22acd2ca688bbf9d94eea4b1da61fdc697 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:33 +0100 Subject: [PATCH 062/210] btrfs: drop argument tree from btrfs_lock_and_flush_ordered_range The tree pointer can be safely read from the inode so we can drop the redundant argument from btrfs_lock_and_flush_ordered_range. 
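For callers this is a pure simplification; a before/after sketch taken from the file.c hunk below:

	/* before: the io_tree had to be passed alongside the inode */
	btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, lockend, NULL);

	/* after: the helper reads inode->io_tree itself */
	btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);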
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 10 +++------- fs/btrfs/ordered-data.h | 3 +-- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7a2f657f5dbd..35accc9b8ced 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3331,7 +3331,7 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, ASSERT(tree == &inode->io_tree); - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, @@ -3354,7 +3354,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ASSERT(tree == &inode->io_tree); - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3f7f5323aefa..fd52ad00b6c8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1561,7 +1561,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); num_bytes = lockend - lockstart + 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 945f8e2043e1..79a2020a805f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4619,7 +4619,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f47accad6f05..e13b3d28c063 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -835,7 +835,6 @@ out: * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. * - * @tree: IO tree used for locking out other users of the range * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock @@ -845,8 +844,7 @@ out: * This function always returns with the given range locked, ensuring after it's * called no order extent can be pending. 
*/ -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state) { @@ -854,13 +852,11 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, struct extent_state *cache = NULL; struct extent_state **cachedp = &cache; - ASSERT(tree == &inode->io_tree); - if (cached_state) cachedp = cached_state; while (1) { - lock_extent_bits(tree, start, end, cachedp); + lock_extent_bits(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -873,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, refcount_dec(&cache->refs); break; } - unlock_extent_cached(tree, start, end, cachedp); + unlock_extent_cached(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index a46f319d9ae0..c01c9698250b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -183,8 +183,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int __init ordered_data_init(void); From 71ad38b44eaa758c70c34ca97a5c3848d6b67e03 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:35 +0100 Subject: [PATCH 063/210] btrfs: sink argument tree to extent_read_full_page The tree pointer can be safely read from the page's inode, use it and drop the redundant argument. 
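The effect on callers, as in the btree_readpage() hunk below, is that the explicit tree lookup disappears entirely:

	static int btree_readpage(struct file *file, struct page *page)
	{
		return extent_read_full_page(page, btree_get_extent, 0);
	}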
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +--- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/inode.c | 4 +--- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f859a28255dd..49daf6c946df 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -978,9 +978,7 @@ static int btree_writepages(struct address_space *mapping, static int btree_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent, 0); + return extent_read_full_page(page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 35accc9b8ced..7e22f7de733f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3361,15 +3361,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree, return ret; } -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; int ret; - ASSERT(tree == &BTRFS_I(page->mapping->host)->io_tree); - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, 0); if (bio) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d205bbaafdc..234622101230 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 79a2020a805f..254f5ea17e40 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8268,9 +8268,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent, 0); + return extent_read_full_page(page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From 0d44fea77e553ab09a0f6061e0b2be68039852b6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:37 +0100 Subject: [PATCH 064/210] btrfs: sink argument tree to __extent_read_full_page The tree pointer can be safely read from the inode, use it and drop the redundant argument. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7e22f7de733f..9e37f9a57c36 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3340,8 +3340,7 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, } } -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, +static int __extent_read_full_page(struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, unsigned long *bio_flags, @@ -3350,10 +3349,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; + struct extent_io_tree *tree = &inode->io_tree; int ret; - ASSERT(tree == &inode->io_tree); - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, @@ -3366,10 +3364,9 @@ int extent_read_full_page(struct page *page, get_extent_t *get_extent, { struct bio *bio = NULL; unsigned long bio_flags = 0; - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, mirror_num, bio_flags); @@ -5451,13 +5448,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; - struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - ASSERT(tree == &BTRFS_I(eb->pages[0]->mapping->host)->io_tree); - num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -5501,7 +5495,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(tree, page, + err = __extent_read_full_page(page, btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); From b6660e80f1c658fa6b886e213d72f18a9dd32ee0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:40 +0100 Subject: [PATCH 065/210] btrfs: sink argument tree to contiguous_readpages The tree pointer can be safely read from the inode, use it and drop the redundant argument.
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9e37f9a57c36..48809260c4ce 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3318,8 +3318,7 @@ out: return ret; } -static inline void contiguous_readpages(struct extent_io_tree *tree, - struct page *pages[], int nr_pages, +static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, struct bio **bio, @@ -3327,10 +3326,9 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, u64 *prev_em_start) { struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; int index; - ASSERT(tree == &inode->io_tree); - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { @@ -4328,7 +4326,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; @@ -4355,7 +4352,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - contiguous_readpages(tree, pagepool, nr, contig_start, + contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); } From f657a31c8667a4d06551e308c1af9ed0b02129f4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:42 +0100 Subject: [PATCH 066/210] btrfs: sink argument tree to __do_readpage The tree pointer can be safely read from the inode, use it and drop the redundant argument. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 48809260c4ce..1001438e6a32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3109,8 +3109,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct extent_io_tree *tree, - struct page *page, +static int __do_readpage(struct page *page, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, @@ -3133,8 +3132,7 @@ static int __do_readpage(struct extent_io_tree *tree, size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; - - ASSERT(tree == &BTRFS_I(inode)->io_tree); + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; set_page_extent_mapped(page); @@ -3326,13 +3324,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 *prev_em_start) { struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); - struct extent_io_tree *tree = &inode->io_tree; int index; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + __do_readpage(pages[index], btrfs_get_extent, em_cached, bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3347,12 +3344,11 @@ static int __extent_read_full_page(struct page *page, struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct extent_io_tree *tree = &inode->io_tree; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); return ret; } From f657a31c8667a4d06551e308c1af9ed0b02129f4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:12 +0800 Subject: [PATCH 067/210] btrfs: sysfs, use btrfs_sysfs_remove_fsid to cleanup errors in add_fsid We have one simple function btrfs_sysfs_remove_fsid() to undo btrfs_sysfs_add_fsid(), which also does proper checks before releasing objects. One difference is that when btrfs_sysfs_remove_fsid is used, we now also call kobject_del(), which was missing before. This was tested (with kobject debug turned on) and no change in behaviour was found. This is a cleanup patch.
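The shape of the fix, as a small user-space sketch with hypothetical names: a single teardown helper that checks each sub-object before releasing it lets every error path perform the full, symmetric cleanup instead of a hand-rolled partial undo.

    /* Toy model of undoing a multi-step setup with one shared helper. */
    #include <stdio.h>
    #include <stdlib.h>

    struct fsid {
        int *main_kobj;     /* stands in for fsid_kobj */
        int *devices_kobj;  /* stands in for devices_kobj */
    };

    /* One teardown helper that checks each sub-object, so every error
     * path performs the same complete cleanup as regular removal. */
    static void remove_fsid(struct fsid *f)
    {
        free(f->devices_kobj);  /* free(NULL) is a safe no-op */
        f->devices_kobj = NULL;
        free(f->main_kobj);
        f->main_kobj = NULL;
    }

    static int add_fsid(struct fsid *f, int fail_second)
    {
        f->main_kobj = malloc(sizeof(int));
        if (!f->main_kobj)
            return -1;
        f->devices_kobj = fail_second ? NULL : malloc(sizeof(int));
        if (!f->devices_kobj) {
            remove_fsid(f);  /* instead of open-coding a partial undo */
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct fsid f = { 0 };
        printf("add: %d\n", add_fsid(&f, 1));
        return 0;
    }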
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3c10e78924d0..119edd4341d6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1371,7 +1371,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); - kobject_put(&fs_devs->fsid_kobj); + btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } From f3cd2c58110dad14ee37cc47fd1473d90ee68ccb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:13 +0800 Subject: [PATCH 068/210] btrfs: sysfs, rename device_link add/remove functions Since commit 668e48af7a94 ("btrfs: sysfs, add devid/dev_state kobject and device attributes"), the functions btrfs_sysfs_add_device_link() and btrfs_sysfs_rm_device_link() do more than just add and remove the device link, as their names indicated. Rename them to be more specific: they are about the directory with the attributes. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/sysfs.c | 10 +++++----- fs/btrfs/sysfs.h | 4 ++-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aad07cdaea8..db93909b25e0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -512,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -743,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); btrfs_sysfs_update_devid(tgt_device); btrfs_rm_dev_replace_free_srcdev(src_device); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 119edd4341d6..651aa65b1f3f 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -960,7 +960,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -1149,7 +1149,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, /* when one_device is NULL, it removes all device links */ -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -1269,7 +1269,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -1395,13 +1395,13 @@ int
btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) btrfs_set_fs_info_ptr(fs_info); - error = btrfs_sysfs_add_device_link(fs_devs, NULL); + error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_rm_device_link(fs_devs, NULL); + btrfs_sysfs_remove_devices_dir(fs_devs, NULL); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c68582add92e..718a26c97833 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,9 +14,9 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b092021e41e9..387f80656476 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2054,7 +2054,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2174,7 +2174,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); + btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2522,7 +2522,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path orig_super_num_devices + 1); /* add sysfs device entry */ - btrfs_sysfs_add_device_link(fs_devices, device); + btrfs_sysfs_add_devices_dir(fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2590,7 +2590,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); From 25864778bce70341aad0fe65e9f5429370bbbbb2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 13 Feb 2020 16:40:53 +0800 Subject: [PATCH 069/210] btrfs: sysfs, unify handler name of devinfo/missing The devinfo attribute handlers were added in 668e48af7a94 ("btrfs: sysfs, add devid/dev_state kobject and device attributes") and the name should contain _devinfo_, there's one that does not conform, so unify it with the rest. 
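One payoff of a uniform prefix, sketched below as a toy (this is not the actual BTRFS_ATTR machinery, just an illustration with hypothetical names): consistent handler names allow token-pasting macros to build attribute tables, which breaks as soon as one handler deviates from the scheme.

    /* Toy sketch: a uniform *_devinfo_*_show naming scheme lets a macro
     * build the attribute table by token pasting. */
    #include <stdio.h>

    #define DEVINFO_ATTR(name) \
        { #name, btrfs_devinfo_##name##_show }

    static int btrfs_devinfo_missing_show(void) { return 0; }
    static int btrfs_devinfo_writeable_show(void) { return 1; }

    struct attr { const char *name; int (*show)(void); };

    static const struct attr devinfo_attrs[] = {
        DEVINFO_ATTR(missing),    /* works only if every handler conforms */
        DEVINFO_ATTR(writeable),
    };

    int main(void)
    {
        for (size_t i = 0; i < sizeof(devinfo_attrs) / sizeof(devinfo_attrs[0]); i++)
            printf("%s=%d\n", devinfo_attrs[i].name, devinfo_attrs[i].show());
        return 0;
    }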
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 651aa65b1f3f..93cf76118a04 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1205,7 +1205,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); -static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, +static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; @@ -1216,7 +1216,7 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%d\n", val); } -BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); +BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, From f603bb94abbed5b5542cfb514ddfa3c5dda96bcc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:08 +0200 Subject: [PATCH 070/210] btrfs: Perform pinned cleanup directly in btrfs_destroy_delayed_refs Having btrfs_destroy_delayed_refs call btrfs_pin_extent is problematic for making pinned extents tracking per-transaction since btrfs_trans_handle cannot be passed to btrfs_pin_extent in this context. Additionally delayed refs heads pinned in btrfs_destroy_delayed_refs are going to be handled very closely, in btrfs_destroy_pinned_extent. To enable btrfs_pin_extent to take btrfs_trans_handle simply open code it in btrfs_destroy_delayed_refs and call btrfs_error_unpin_extent_range on the range. This enables us to do less work in btrfs_destroy_pinned_extent and leaves btrfs_pin_extent being called in contexts which have a valid btrfs_trans_handle. 
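The open-coded block amounts to moving the byte count from the reservation counters to the pinned counters; a simplified user-space model of that bookkeeping follows (locking reduced to comments, fields trimmed, names hypothetical).

    /* Toy model of the reserved -> pinned accounting move. */
    #include <stdio.h>
    #include <stdint.h>

    struct space_info { uint64_t bytes_reserved; uint64_t bytes_pinned; };
    struct block_group { struct space_info *sinfo; uint64_t reserved; uint64_t pinned; };

    /* Mirrors the open-coded block: take the space_info lock, then the
     * block group lock (ordering matters in the kernel), and move
     * num_bytes between the counters at both levels. */
    static void pin_head_bytes(struct block_group *bg, uint64_t num_bytes)
    {
        /* spin_lock(&bg->sinfo->lock); spin_lock(&bg->lock); in the real code */
        bg->pinned += num_bytes;
        bg->sinfo->bytes_pinned += num_bytes;
        bg->reserved -= num_bytes;
        bg->sinfo->bytes_reserved -= num_bytes;
        /* unlock in reverse order */
    }

    int main(void)
    {
        struct space_info si = { .bytes_reserved = 1 << 20 };
        struct block_group bg = { .sinfo = &si, .reserved = 1 << 20 };
        pin_head_bytes(&bg, 4096);
        printf("reserved=%llu pinned=%llu\n",
               (unsigned long long)si.bytes_reserved,
               (unsigned long long)si.bytes_pinned);
        return 0;
    }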
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 49daf6c946df..194c98a61095 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "ref-verify.h" #include "block-group.h" #include "discard.h" +#include "space-info.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -4308,9 +4309,30 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + if (pin_bytes) { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, head->num_bytes); + cache->reserved -= head->num_bytes; + cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + + btrfs_put_block_group(cache); + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); From b25c36f84b59a64fd5815f341b6ddbd8a8a2bb56 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:09 +0200 Subject: [PATCH 071/210] btrfs: Make btrfs_pin_extent take trans handle Preparation for switching pinned extent tracking to a per-transaction basis. 
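As a toy sketch of the signature change (hypothetical names): the handle both witnesses that a transaction is running, which is what the dropped ASSERT(fs_info->running_transaction) used to check at runtime, and still provides fs_info via trans->fs_info.

    /* Toy sketch of threading a transaction handle through an API. */
    #include <stdio.h>

    struct fs_info { int generation; };
    struct trans_handle { struct fs_info *fs_info; };

    /* Before: int pin_extent(struct fs_info *fs_info, ...); plus an assert.
     * After: holding a handle is itself the proof a transaction exists. */
    static int pin_extent(struct trans_handle *trans, unsigned long long bytenr)
    {
        struct fs_info *fs_info = trans->fs_info;
        return fs_info->generation + (int)(bytenr & 1);
    }

    int main(void)
    {
        struct fs_info fi = { .generation = 7 };
        struct trans_handle trans = { .fs_info = &fi };
        printf("%d\n", pin_extent(&trans, 4096));
        return 0;
    }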
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 17 ++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ffd99c3f64db..2f0ca1cb1fb9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2463,8 +2463,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index da3d9eaf3b22..5a90d9d8b665 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1705,8 +1705,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans)) { if (insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return 0; } @@ -1721,8 +1720,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return ret; } @@ -1867,8 +1865,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, fs_info->csum_root, head->bytenr, head->num_bytes); @@ -2612,14 +2609,12 @@ static int pin_down_extent(struct btrfs_block_group *cache, return 0; } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; - ASSERT(fs_info->running_transaction); - - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ pin_down_extent(cache, bytenr, num_bytes, reserved); @@ -3345,7 +3340,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) (ref->type == BTRFS_REF_DATA && ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { From 6787bb9f35157f718b53e6b63461e446a148e751 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:10 +0200 Subject: [PATCH 072/210] btrfs: Introduce unaccount_log_buffer This function correctly adjusts the reserved bytes occupied by a log tree extent buffer. It will be used instead of calling btrfs_pin_reserved_extent. 
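Note the defensive shape of the helper: it runs on paths that are already failing, so a missed block group lookup is logged and tolerated rather than escalated into a crash. A reduced user-space model (hypothetical names, sizes arbitrary):

    /* Toy sketch: cleanup helpers on error paths report and return. */
    #include <stdio.h>

    struct block_group { unsigned long long start, reserved; };

    static struct block_group *lookup_block_group(struct block_group *groups,
                                                  int n, unsigned long long start)
    {
        for (int i = 0; i < n; i++)
            if (groups[i].start == start)
                return &groups[i];
        return NULL;
    }

    static void unaccount_buffer(struct block_group *groups, int n,
                                 unsigned long long start, unsigned long long nodesize)
    {
        struct block_group *bg = lookup_block_group(groups, n, start);
        if (!bg) {
            fprintf(stderr, "unable to find block group for %llu\n", start);
            return;  /* an error path must not make things worse */
        }
        bg->reserved -= nodesize;
    }

    int main(void)
    {
        struct block_group groups[] = { { 0, 65536 }, { 1048576, 65536 } };
        unaccount_buffer(groups, 2, 1048576, 16384);
        unaccount_buffer(groups, 2, 4242, 16384);  /* not found: logged, not fatal */
        printf("reserved=%llu\n", groups[1].reserved);
        return 0;
    }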
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ea47bac5fcf..32b087f543f2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,8 @@ #include "compression.h" #include "qgroup.h" #include "inode-map.h" +#include "block-group.h" +#include "space-info.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -2664,6 +2666,29 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return ret; } +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return; + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->reserved -= fs_info->nodesize; + cache->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, From 10e958d523f82e976e5bdb13898da0e0b3af61e0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:11 +0200 Subject: [PATCH 073/210] btrfs: Call btrfs_pin_reserved_extent only during active transaction Calling btrfs_pin_reserved_extent makes sense only with a valid transaction since pinned extents are processed from transaction commit in btrfs_finish_extent_commit. In case of error it's sufficient to adjust the reserved counter to account for log tree extents allocated in the last transaction. This commit moves btrfs_pin_reserved_extent to be called only with a valid transaction handle and otherwise uses the newly introduced unaccount_log_buffer to adjust "reserved". If this is not done and a failure occurs before the transaction is committed, WARN_ONs are going to be triggered on unmount. This was especially pronounced with the generic/475 test.
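The resulting split, as a toy sketch (names and the "written" condition simplified): blocks that reached disk are pinned through the transaction so that commit can process them, while blocks that never hit disk only get their reservation dropped.

    /* Toy decision sketch for the two cleanup paths. */
    #include <stdio.h>
    #include <stdbool.h>

    struct trans { int id; };

    static int pin_reserved(struct trans *t, unsigned long long start)
    {
        printf("trans %d: pin %llu, commit will unpin it later\n", t->id, start);
        return 0;
    }

    static void unaccount_buffer(unsigned long long start)
    {
        printf("buffer %llu never hit disk: only adjust \"reserved\"\n", start);
    }

    /* Pinning is meaningful only when a commit will later process the
     * pinned ranges; otherwise just fix up the counters so unmount sees
     * no leaked reservation. */
    static int free_log_block(struct trans *t, bool written, unsigned long long start)
    {
        if (written)
            return pin_reserved(t, start);   /* valid transaction required */
        unaccount_buffer(start);             /* error path, no pinning */
        return 0;
    }

    int main(void)
    {
        struct trans t = { .id = 1 };
        free_log_block(&t, true, 30408704);
        free_log_block(&t, false, 30425088);
        return 0;
    }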
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 54 +++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 32b087f543f2..c27e121603ac 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2695,12 +2695,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - struct extent_buffer *parent; u32 blocksize; int ret = 0; @@ -2720,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) return PTR_ERR(next); @@ -2750,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(fs_info, + bytenr, blocksize); + if (ret) { + free_extent_buffer(next); + return ret; + } } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - - WARN_ON(root_owner != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; + unaccount_log_buffer(fs_info, bytenr); } } free_extent_buffer(next); @@ -2792,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; int i; int slot; int ret; @@ -2805,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - struct extent_buffer *parent; - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); @@ -2829,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(fs_info, + path->nodes[*level]->start, + path->nodes[*level]->len); + if (ret) + return ret; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2910,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(fs_info, + next->start, next->len); + if (ret) + goto out; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); } - - ret = 
btrfs_pin_reserved_extent(fs_info, next->start, - next->len); - if (ret) - goto out; } } From 7bfc10070573591163dae7be9bb09552d4e6dee5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:12 +0200 Subject: [PATCH 074/210] btrfs: Make btrfs_pin_reserved_extent take transaction handle btrfs_pin_reserved_extent is now only called with a valid transaction so exploit the fact to take a transaction. This is preparation for tracking pinned extents on a per-transaction basis. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 8 +++++--- fs/btrfs/tree-log.c | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2f0ca1cb1fb9..3518cbd07015 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2501,7 +2501,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5a90d9d8b665..6a61e972593e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4184,14 +4184,16 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, start); if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); + btrfs_err(trans->fs_info, "unable to find block group for %llu", + start); return -ENOSPC; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c27e121603ac..8471f3c5525e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2745,7 +2745,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(fs_info, + ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { free_extent_buffer(next); @@ -2814,7 +2814,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(fs_info, + ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); if (ret) @@ -2896,7 +2896,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(fs_info, + ret = btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; From 9fce5704542c5e97d458cc97f9cecef253f02f06 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:13 +0200 Subject: [PATCH 075/210] btrfs: Make btrfs_pin_extent_for_log_replay take transaction handle Preparation for refactoring pinned extents tracking. 
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/tree-log.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3518cbd07015..22d0cb0019d1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2465,7 +2465,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6a61e972593e..e6ea01d76659 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2626,13 +2626,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8471f3c5525e..19c107be9ef6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -313,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) - ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { @@ -6189,7 +6189,7 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(fs_info, + ret = btrfs_pin_extent_for_log_replay(trans, log->node->start, log->node->len); free_extent_buffer(log->node); From 6690d07126e1eadd37fccb7e8e275b3ab7bdb524 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:14 +0200 Subject: [PATCH 076/210] btrfs: Make pin_down_extent take transaction handle All callers have a reference to a transaction handle so pass it to pin_down_extent. This is the final step before switching pinned extent tracking to a per-transaction basis. 
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e6ea01d76659..425eefeaacf8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2585,7 +2585,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2617,7 +2618,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -2644,7 +2645,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, */ btrfs_cache_block_group(cache, 1); - pin_down_extent(cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -3296,7 +3297,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); + pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -4197,7 +4198,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + ret = pin_down_extent(trans, cache, start, len, 1); btrfs_put_block_group(cache); return ret; } From 6b45f64172e130be86ab28a3888509ee4498a6ee Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:15 +0200 Subject: [PATCH 077/210] btrfs: Pass transaction handle to write_pinned_extent_entries Preparation for refactoring pinned extents tracking. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0598fd3c6e3f..9d6372139547 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1067,6 +1067,7 @@ fail: } static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) @@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. 
No locking needed */ - ret = write_pinned_extent_entries(block_group, io_ctl, &entries); + ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; From f2fb72983bdcf57adf7e272107408391ef3dbb2d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:16 +0200 Subject: [PATCH 078/210] btrfs: Mark pinned log extents as excluded In preparation for making pinned extents per-transaction, ensure that such log extents are always excluded from caching. To achieve this, in addition to marking them via btrfs_pin_extent_for_log_replay they also need to be marked with btrfs_add_excluded_extent to prevent log tree extent buffers from being loaded by the free space caching thread. That's required since log tree blocks are not recorded in the extent tree, hence they always look free. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 425eefeaacf8..f97e631aaca5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2633,6 +2633,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; int ret; + btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2919,6 +2921,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + clear_extent_bits(&fs_info->freed_extents[0], start, + end, EXTENT_UPTODATE); + clear_extent_bits(&fs_info->freed_extents[1], start, + end, EXTENT_UPTODATE); + } if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, From 45bb5d6ae961cdfa84b4a151beb77b2a780b2456 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:17 +0200 Subject: [PATCH 079/210] btrfs: Factor out pinned extent clean up in btrfs_delete_unused_bgs The next patch is going to refactor how pinned extents are tracked, which will necessitate changing this code. To ease that work and contain the changes, factor out the code now in preparation; this will also help review. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 67 ++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c12d6399c6d7..9fec78a8c759 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1248,6 +1248,43 @@ out: return ret; } +static bool clean_pinned_extents(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + const u64 start = bg->start; + const u64 end = start + bg->length - 1; + int ret; + + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, another + * task might be running finish_extent_commit() for the previous + * transaction N - 1, and have seen a range belonging to the block + * group in freed_extents[] before we were able to clear the whole + * block group range from freed_extents[]. This means that task can + * lookup for the block group after we unpinned it from freed_extents + * and removed it, leading to a BUG_ON() at unpin_extent_range().
+ */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY); + if (ret) + goto err; + + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY); + if (ret) + goto err; + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + return true; + +err: + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(bg); + return false; +} + /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. @@ -1265,7 +1302,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1344,35 +1380,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ - start = block_group->start; - end = start + block_group->length - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). - */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); + if (!clean_pinned_extents(block_group)) goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* * At this point, the block_group is read only and should fail From fe119a6eeb670585e29dbe3932e00ad29ae8f5f9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:18 +0200 Subject: [PATCH 080/210] btrfs: switch to per-transaction pinned extents This commit flips the switch to start tracking/processing pinned extents on a per-transaction basis. It mostly replaces all references from btrfs_fs_info::(pinned_extents|freed_extents[]) to btrfs_transaction::pinned_extents. Two notable modifications warrant explicit mention. One is changing clean_pinned_extents to get a reference to the previously running transaction. The other is the removal of the call to btrfs_destroy_pinned_extent, since transactions are going to be cleaned in btrfs_cleanup_one_transaction.
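The previous-transaction lookup in clean_pinned_extents can be modeled in user space with an array standing in for fs_info->trans_list and a plain integer for refcount_t (all names hypothetical):

    /* Toy sketch of grabbing the previous transaction under a lock. */
    #include <stdio.h>
    #include <stddef.h>

    struct transaction { int id; int use_count; };

    /* Under fs_info->trans_lock in the real code: if the current
     * transaction is not the oldest on the list, its predecessor may
     * still be unpinning, so take a reference before touching its
     * pinned_extents tree. */
    static struct transaction *get_prev_trans(struct transaction *list, size_t n,
                                              size_t cur_idx)
    {
        if (cur_idx == 0 || cur_idx >= n)
            return NULL;                 /* current is the oldest, no N - 1 */
        list[cur_idx - 1].use_count++;   /* refcount_inc(&prev->use_count) */
        return &list[cur_idx - 1];
    }

    int main(void)
    {
        struct transaction trans_list[] = { { 100, 1 }, { 101, 1 } };
        struct transaction *prev = get_prev_trans(trans_list, 2, 1);
        if (prev)
            printf("prev trans %d, use_count now %d\n", prev->id, prev->use_count);
        return 0;
    }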
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 38 ++++++++++++++++++++++++------------ fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 30 +++++----------------------- fs/btrfs/extent-io-tree.h | 4 ++-- fs/btrfs/extent-tree.c | 31 ++++++++--------------------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/transaction.c | 2 ++ fs/btrfs/transaction.h | 1 + include/trace/events/btrfs.h | 4 ++-- 9 files changed, 48 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9fec78a8c759..b8f39a679064 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end int ret; while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, + ret = find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, EXTENT_DIRTY | EXTENT_UPTODATE, NULL); @@ -1248,30 +1248,42 @@ out: return ret; } -static bool clean_pinned_extents(struct btrfs_block_group *bg) +static bool clean_pinned_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; int ret; + spin_lock(&fs_info->trans_lock); + if (trans->transaction->list.prev != &fs_info->trans_list) { + prev_trans = list_last_entry(&trans->transaction->list, + struct btrfs_transaction, list); + refcount_inc(&prev_trans->use_count); + } + spin_unlock(&fs_info->trans_lock); + /* * Hold the unused_bg_unpin_mutex lock to avoid racing with * btrfs_finish_extent_commit(). If we are at transaction N, another * task might be running finish_extent_commit() for the previous * transaction N - 1, and have seen a range belonging to the block - * group in freed_extents[] before we were able to clear the whole - * block group range from freed_extents[]. This means that task can - * lookup for the block group after we unpinned it from freed_extents - * and removed it, leading to a BUG_ON() at unpin_extent_range(). + * group in pinned_extents before we were able to clear the whole block + * group range from pinned_extents. This means that task can lookup for + * the block group after we unpinned it from pinned_extents and removed + * it, leading to a BUG_ON() at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) - goto err; + if (prev_trans) { + ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto err; + } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, EXTENT_DIRTY); if (ret) goto err; @@ -1380,7 +1392,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. 
*/ - if (!clean_pinned_extents(block_group)) + if (!clean_pinned_extents(trans, block_group)) goto end_trans; /* @@ -2890,7 +2902,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, &cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 22d0cb0019d1..bb237d577725 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -596,8 +596,8 @@ struct btrfs_fs_info { /* keep track of unallocated space */ atomic64_t free_chunk_space; - struct extent_io_tree freed_extents[2]; - struct extent_io_tree *pinned_extents; + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ struct extent_map_tree mapping_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 194c98a61095..e1e111c8b08b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2075,10 +2075,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log_root_tree(NULL, fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); - } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -2749,11 +2747,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - fs_info->pinned_extents = &fs_info->freed_extents[0]; + extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS, NULL); set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); @@ -4434,16 +4429,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, } static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents) + struct extent_io_tree *unpin) { - struct extent_io_tree *unpin; u64 start; u64 end; int ret; - bool loop = true; - unpin = pinned_extents; -again: while (1) { struct extent_state *cached_state = NULL; @@ -4468,15 +4459,6 @@ again: cond_resched(); } - if (loop) { - if (unpin == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - loop = false; - goto again; - } - return 0; } @@ -4567,8 +4549,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); - btrfs_destroy_pinned_extent(fs_info, - fs_info->pinned_extents); + btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); @@ -4620,7 +4601,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); btrfs_destroy_all_delalloc_inodes(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); diff --git 
a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cc3037f9765e..b4a7bad3e82e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -36,8 +36,8 @@ struct io_failure_record; #define CHUNK_TRIMMED EXTENT_DEFRAG enum { - IO_TREE_FS_INFO_FREED_EXTENTS0, - IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_FS_PINNED_EXTENTS, + IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f97e631aaca5..136fffb76428 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - set_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + set_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); return 0; } @@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) start = cache->start; end = start + cache->length - 1; - clear_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + clear_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); } static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) @@ -2605,7 +2601,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(fs_info->pinned_extents, bytenr, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } @@ -2761,11 +2757,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) } } - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->commit_root_sem); btrfs_update_global_block_rsv(fs_info); @@ -2906,10 +2897,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 end; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; + unpin = &trans->transaction->pinned_extents; while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; @@ -2921,12 +2909,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - clear_extent_bits(&fs_info->freed_extents[0], start, + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + clear_extent_bits(&fs_info->excluded_extents, start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], start, - end, EXTENT_UPTODATE); - } if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9d6372139547..bd9c4b5da549 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1086,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries( * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = block_group->fs_info->pinned_extents; + unpin = &trans->transaction->pinned_extents; start = block_group->start; 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 37680351b7c3..fdfdfc426539 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -336,6 +336,8 @@ loop: list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS, NULL); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 453cea7c7a72..31ae8d273065 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -71,6 +71,7 @@ struct btrfs_transaction { */ struct list_head io_bgs; struct list_head dropped_roots; + struct extent_io_tree pinned_extents; /* * we need to make sure block group deletion doesn't race with diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f1f2b6a04052..bcbc763b8814 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -81,8 +81,8 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); #define show_extent_io_tree_owner(owner) \ __print_symbolic(owner, \ - { IO_TREE_FS_INFO_FREED_EXTENTS0, "FREED_EXTENTS0" }, \ - { IO_TREE_FS_INFO_FREED_EXTENTS1, "FREED_EXTENTS1" }, \ + { IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS" }, \ + { IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS" }, \ { IO_TREE_INODE_IO, "INODE_IO" }, \ { IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE" }, \ { IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS" }, \ From e19221180dedd31bc8c1f432648add24efbccb9c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 12 Feb 2020 15:43:31 +0800 Subject: [PATCH 081/210] btrfs: relocation: Remove is_cowonly_root() This function is only used in read_fs_root(), which is just a wrapper of btrfs_get_fs_root(). For all the mentioned essential roots except log root tree, btrfs_get_fs_root() has its own quick path to grab them from fs_info directly, thus there is no need for key.offset modification. For subvolume trees, btrfs_get_fs_root() with key.offset == -1 is completely fine. For log trees and log root tree, it's impossible to hit them, as for relocation all backrefs are fetched from commit root, which never records log tree blocks. Log tree blocks either get freed in regular transaction commit, or replayed at mount time. At runtime we should never hit a backref for a log tree in the extent tree.
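After the removal, every caller builds one and the same key. A toy sketch of its shape (structures simplified; offset (u64)-1 conventionally means "the latest root item"):

    /* Toy sketch of the unified root lookup key. */
    #include <stdio.h>
    #include <stdint.h>

    #define BTRFS_ROOT_ITEM_KEY 132

    struct btrfs_key { uint64_t objectid; uint8_t type; uint64_t offset; };

    /* Essential trees are grabbed straight from fs_info by the real
     * btrfs_get_fs_root(), so a single key shape serves every caller. */
    static struct btrfs_key read_fs_root_key(uint64_t root_objectid)
    {
        struct btrfs_key key = {
            .objectid = root_objectid,
            .type = BTRFS_ROOT_ITEM_KEY,
            .offset = (uint64_t)-1,
        };
        return key;
    }

    int main(void)
    {
        struct btrfs_key k = read_fs_root_key(5);  /* 5 = FS_TREE, for example */
        printf("%llu %u %llu\n", (unsigned long long)k.objectid,
               (unsigned)k.type, (unsigned long long)k.offset);
        return 0;
    }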
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 034f5f151a74..6130ba69bc49 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -640,21 +640,6 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, return btrfs_grab_root(root); } -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID || - root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID || - root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return 1; - return 0; -} - static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, u64 root_objectid) { @@ -662,10 +647,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, key.objectid = root_objectid; key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; + key.offset = (u64)-1; return btrfs_get_fs_root(fs_info, &key, false); } From ab9b2c7b32e6be53cac2e23f5b2db66815a7d972 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:30 -0500 Subject: [PATCH 082/210] btrfs: handle logged extent failure properly If we're allocating a logged extent we attempt to insert an extent record for the file extent directly. We increase space_info->bytes_reserved, because the extent entry addition will call btrfs_update_block_group(), which will convert the ->bytes_reserved to ->bytes_used. However if we fail at any point while inserting the extent entry we will bail and leave space on ->bytes_reserved, which will trigger a WARN_ON() on umount. Fix this by pinning the space if we fail to insert, which is what happens in every other failure case that involves adding the extent entry. 
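The invariant behind the fix, in a reduced user-space model (hypothetical names): bytes moved into the reserved counter must leave it on every path, becoming used on success or pinned on failure, so nothing is left behind to trip the WARN_ON at unmount.

    /* Toy sketch: balance the space accounting on insertion failure. */
    #include <stdio.h>
    #include <stdint.h>

    struct space { uint64_t reserved, used, pinned; };

    static int insert_extent_entry(struct space *s, uint64_t len, int fail)
    {
        if (fail)
            return -5;           /* entry never added, reserved untouched */
        s->reserved -= len;      /* success converts reserved -> used */
        s->used += len;
        return 0;
    }

    static int alloc_logged_extent(struct space *s, uint64_t len, int fail)
    {
        int ret = insert_extent_entry(s, len, fail);
        if (ret) {
            /* balance the books, like the other failure paths that
             * involve adding an extent entry */
            s->reserved -= len;
            s->pinned += len;
        }
        return ret;
    }

    int main(void)
    {
        struct space s = { .reserved = 8192 };
        alloc_logged_extent(&s, 4096, 1);
        printf("reserved=%llu pinned=%llu\n",
               (unsigned long long)s.reserved, (unsigned long long)s.pinned);
        return 0;   /* reservation drained, nothing left to warn about */
    }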
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 136fffb76428..7eef91d6c2b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4422,7 +4422,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); if (ret) - btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } From 55ffaabe23c644fb95c50d852fa39e6f7321a815 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Feb 2020 10:20:02 +0000 Subject: [PATCH 083/210] Btrfs: avoid unnecessary splits when setting bits on an extent io tree When attempting to set bits on a range of an extent io tree that already has those bits set we can end up splitting an extent state record, use the preallocated extent state record, insert it into the red black tree, do another search on the red black tree, merge the preallocated extent state record with the previous extent state record, remove that previous record from the red black tree and then free it. This is all unnecessary work that consumes time. This happens specifically in the following case in __set_extent_bit(): $ cat -n fs/btrfs/extent_io.c 957 static int __must_check 958 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, (...) 1044 /* 1045 * | ---- desired range ---- | 1046 * | state | 1047 * or 1048 * | ------------- state -------------- | 1049 * (...) 1060 if (state->start < start) { 1061 if (state->state & exclusive_bits) { 1062 *failed_start = start; 1063 err = -EEXIST; 1064 goto out; 1065 } 1066 1067 prealloc = alloc_extent_state_atomic(prealloc); 1068 BUG_ON(!prealloc); 1069 err = split_state(tree, state, prealloc, start); 1070 if (err) 1071 extent_io_tree_panic(tree, err); 1072 1073 prealloc = NULL; So if our extent state represents a range from 0 to 1MiB for example, and we want to set bits in the range 128KiB to 256KiB for example, and that extent state record already has all those bits set, we end up splitting that record, so we end up with extent state records in the tree which represent the ranges from 0 to 128KiB and from 128KiB to 1MiB. This is temporary because a subsequent iteration in that function will end up merging the records. The splitting requires using the preallocated extent state record, so a future iteration that needs to do another split will need to allocate another extent state record in an atomic context, something not ideal that we try to avoid as much as possible. The splitting also requires an insertion in the red black tree, and a subsequent merge will require a deletion from the red black tree and freeing an extent state record. This change just skips the splitting of an extent state record when it already has all the bits that we need to set. Setting a bit that is already set for a range is very common in the inode's 'file_extent_tree' extent io tree for example, where we keep setting the EXTENT_DIRTY bit every time we replace an extent.
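A toy model of the added check (structures heavily simplified): when a state record already carries every requested bit, the walk simply advances past it and the record stays whole, so there is no split, no rb-tree insertion and no later merge.

    /* Toy sketch of skipping fully-covered extent states. */
    #include <stdio.h>
    #include <stdint.h>

    struct state { uint64_t start, end; unsigned bits; };

    /* Returns the next offset to process for this state. */
    static uint64_t set_bits_step(struct state *st, unsigned bits)
    {
        if ((st->bits & bits) == bits)
            return st->end + 1;  /* fully covered: skip, no split needed */
        st->bits |= bits;        /* the real code may first split the state
                                  * at the requested start; elided here */
        return st->end + 1;
    }

    int main(void)
    {
        /* [0, 1MiB) already carrying the requested bit, as in the example */
        struct state st = { 0, (1 << 20) - 1, 0x1 };
        printf("next=%llu\n", (unsigned long long)set_bits_step(&st, 0x1));
        return 0;
    }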
This change also fixes a bug that happens after the recent patchset from Josef that avoids having implicit holes after a power failure when not using the NO_HOLES feature, more specifically the patch with the subject: "btrfs: introduce the inode->file_extent_tree". This patch introduced an extent io tree per inode to keep track of completed ordered extents and figure out at any time what is the safe value for the inode's disk_i_size. This assumes that for contiguous ranges in a file we always end up with a single extent state record in the io tree, but that is not the case, as there is a short time window where we can have two extent state records representing contiguous ranges. When this happens we end up setting an incorrect value for the inode's disk_i_size, resulting in data loss after a clean unmount of the filesystem. The following example explains how this can happen. Suppose we have an inode with an i_size and a disk_i_size of 1MiB, so in the inode's file_extent_tree we have a single extent state record that represents the range [0, 1MiB) with the EXTENT_DIRTY bit set. Then the following steps happen: 1) A buffered write against file range [512KiB, 768KiB) is made. At this point delalloc was not flushed yet; 2) Deduplication from some other inode into this inode's range [128KiB, 256KiB) is made. This causes btrfs_inode_set_file_extent_range() to be called, from btrfs_insert_clone_extent(), to mark the range [128KiB, 256KiB) with EXTENT_DIRTY in the inode's file_extent_tree; 3) When btrfs_inode_set_file_extent_range() calls set_extent_bits(), we end up at __set_extent_bit(). In the first iteration of that function's loop we end up in the following branch: $ cat -n fs/btrfs/extent_io.c 957 static int __must_check 958 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, (...) 1044 /* 1045 * | ---- desired range ---- | 1046 * | state | 1047 * or 1048 * | ------------- state -------------- | 1049 * (...) 1060 if (state->start < start) { 1061 if (state->state & exclusive_bits) { 1062 *failed_start = start; 1063 err = -EEXIST; 1064 goto out; 1065 } 1066 1067 prealloc = alloc_extent_state_atomic(prealloc); 1068 BUG_ON(!prealloc); 1069 err = split_state(tree, state, prealloc, start); 1070 if (err) 1071 extent_io_tree_panic(tree, err); 1072 1073 prealloc = NULL; (...) 1089 goto search_again; This splits the state record into two, one for range [0, 128KiB) and another for the range [128KiB, 1MiB). Both already have the EXTENT_DIRTY bit set. Then we jump to the 'search_again' label, where we unlock the spinlock protecting the extent io tree before jumping to the 'again' label to perform the next iteration; 4) In the meantime, delalloc is flushed, the ordered extent for the range [512KiB, 768KiB) is created and when it completes, at btrfs_finish_ordered_io(), it calls btrfs_inode_safe_disk_i_size_write() with a value of 0 for its 'new_size' argument; 5) Before the deduplication task currently at __set_extent_bit() moves to the next iteration, the task finishing the ordered extent calls find_first_extent_bit() through btrfs_inode_safe_disk_i_size_write() and gets 'start' set to 0 and 'end' set to 128KiB - 1, because at this moment the io tree has two extent state records, one representing the range [0, 128KiB) and another representing the range [128KiB, 1MiB), both with EXTENT_DIRTY set. Then we set 'isize' to: isize = min(isize, end + 1) = min(1MiB, 128KiB - 1 + 1) = 128KiB Then we set the inode's disk_i_size to 128KiB (isize).
After a clean unmount of the filesystem and mounting it again, we have the file with a size of 128KiB, and effectively lost all the data it had before in the range from 128KiB to 1MiB. This change fixes that issue too, as we never end up splitting extent state records when they already have all the bits we want set. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1001438e6a32..4f4b41c454f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1063,6 +1063,16 @@ hit_next: goto out; } + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); From f6d9abbc1f957f0fcf88d567ae2523d37d2731da Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 14 Feb 2020 00:24:29 +0900 Subject: [PATCH 084/210] btrfs: Export btrfs_release_disk_super Preparatory patch for removal of buffer_head usage in btrfs. Signed-off-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 387f80656476..89be36a0fed6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -static void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f01552a0785e..f197fac0a3ae 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -484,6 +484,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_release_disk_super(struct page *page); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) From c514c9b10bc1ec8b487e82900ab57f46a21956d9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:30 +0900 Subject: [PATCH 085/210] btrfs: don't kmap() pages from block devices Block device mappings are never in highmem so kmap() / kunmap() calls for pages from block devices are unneeded. Use page_address() instead of kmap() to get to the virtual addresses. While we're at it, read_cache_page_gfp() doesn't return NULL on error, only an ERR_PTR, so use IS_ERR() to check for errors.
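The calling convention the patch relies on can be summarized with a hypothetical kernel-style helper (super_block_address() is not a real function, just a sketch of the pattern):

#include <linux/err.h>
#include <linux/pagemap.h>

/* Hypothetical helper: map one page of a block device's mapping. */
static void *super_block_address(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        page = read_cache_page_gfp(mapping, index, GFP_KERNEL);
        if (IS_ERR(page))               /* errors are ERR_PTR, never NULL */
                return ERR_CAST(page);  /* so plain IS_ERR() suffices */

        /*
         * Block device pages are never in highmem, so page_address()
         * is valid without kmap(); the caller still holds the page
         * reference and must put_page() it when done.
         */
        return page_address(page);
}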
Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89be36a0fed6..00baa806662a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1249,7 +1249,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void btrfs_release_disk_super(struct page *page) { - kunmap(page); put_page(page); } @@ -1277,10 +1276,10 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); - if (IS_ERR_OR_NULL(*page)) + if (IS_ERR(*page)) return 1; - p = kmap(*page); + p = page_address(*page); /* align our pointer to the offset of the super block */ *disk_super = p + offset_in_page(bytenr); From 6fbceb9fa4f6749445bc0e2ac6750f0ae8e85b42 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:31 +0900 Subject: [PATCH 086/210] btrfs: reduce scope of btrfs_scratch_superblocks() btrfs_scratch_superblocks() isn't used anywhere outside volumes.c so remove it from the header file and mark it as static. Also move it above its callers so we don't need a forward declaration. Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 59 +++++++++++++++++++++++----------------------- fs/btrfs/volumes.h | 1 - 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 00baa806662a..bfe04233fcaa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1948,6 +1948,35 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblocks(struct block_device *bdev, + const char *device_path) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + int copy_num; + + if (!bdev) + return; + + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -7316,36 +7345,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) -{ - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - int copy_num; - - if (!bdev) - return; - - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; - copy_num++) { - - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); -} - /* * Update the size and bytes used for each device where it changed.
This is * delayed since we would otherwise get errors while writing out the diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f197fac0a3ae..52181897446e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -474,7 +474,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, From 8f32380d3f2904ad0e98574ae17f2018f6dd277e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:32 +0900 Subject: [PATCH 087/210] btrfs: use the page cache for super block reading Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have some drawbacks, like not being able to propagate errors from the lower layers. Directly use the page cache for reading the super blocks from disk or invalidating an on-disk super block. We have to use the page cache to avoid races between mkfs and udev. See also 6f60cbd3ae44 ("btrfs: access superblock via pagecache in scan_one_device"). This patch unwraps the buffer head API and does not change the way the super block is actually read. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 74 +++++++++++++++++++------------------- fs/btrfs/disk-io.h | 6 ++-- fs/btrfs/volumes.c | 77 ++++++++++++++++++++++++++-------------------- fs/btrfs/volumes.h | 4 +-- 4 files changed, 79 insertions(+), 82 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1e111c8b08b..ac9554d71a46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2846,7 +2846,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u64 features; u16 csum_type; struct btrfs_key location; - struct buffer_head *bh; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; @@ -2887,9 +2886,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); + disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + if (IS_ERR(disk_super)) { + err = PTR_ERR(disk_super); goto fail_alloc; } @@ -2897,18 +2896,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * Verify the type first, if that or the the checksum value are * corrupted, we'll find out */ - csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + csum_type = btrfs_super_csum_type(disk_super); if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2916,10 +2916,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/ - if (btrfs_check_super_csum(fs_info, bh->b_data)) { + if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2928,8 +2928,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ - memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - brelse(bh); + memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); + btrfs_release_disk_super(disk_super); disk_super = fs_info->super_copy; @@ -3416,45 +3416,38 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret) +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num) { - struct buffer_head *bh; struct btrfs_super_block *super; + struct page *page; u64 bytenr; + struct address_space *mapping = bdev->bd_inode->i_mapping; bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); - bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); - /* - * If we fail to read from the underlying devices, as of now - * the best option we have is to mark it EIO. - */ - if (!bh) - return -EIO; + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return ERR_CAST(page); - super = (struct btrfs_super_block *)bh->b_data; + super = page_address(page); if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - return -EINVAL; + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); } - *bh_ret = bh; - return 0; + return super; } -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) { - struct buffer_head *bh; - struct buffer_head *latest = NULL; - struct btrfs_super_block *super; + struct btrfs_super_block *super, *latest = NULL; int i; u64 transid = 0; - int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. 
@@ -3462,25 +3455,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - ret = btrfs_read_dev_one_super(bdev, i, &bh); - if (ret) + super = btrfs_read_dev_one_super(bdev, i); + if (IS_ERR(super)) continue; - super = (struct btrfs_super_block *)bh->b_data; - if (!latest || btrfs_super_generation(super) > transid) { - brelse(latest); - latest = bh; + if (latest) + btrfs_release_disk_super(super); + + latest = super; transid = btrfs_super_generation(super); - } else { - brelse(bh); } } - if (!latest) - return ERR_PTR(ret); - - return latest; + return super; } /* diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index db21ab614357..59c885860bf8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -56,9 +56,9 @@ int __cold open_ctree(struct super_block *sb, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret); +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bfe04233fcaa..d8a88866aaa3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, - struct buffer_head **bh) + struct btrfs_super_block **disk_super) { int ret; @@ -519,9 +518,9 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, goto error; } invalidate_bdev(*bdev); - *bh = btrfs_read_dev_super(*bdev); - if (IS_ERR(*bh)) { - ret = PTR_ERR(*bh); + *disk_super = btrfs_read_dev_super(*bdev); + if (IS_ERR(*disk_super)) { + ret = PTR_ERR(*disk_super); blkdev_put(*bdev, flags); goto error; } @@ -530,7 +529,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, error: *bdev = NULL; - *bh = NULL; return ret; } @@ -611,7 +609,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, { struct request_queue *q; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -622,17 +619,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); + &bdev, &disk_super); if (ret) return ret; - disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) - goto error_brelse; + goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) - goto error_brelse; + goto error_free_page; device->generation = btrfs_super_generation(disk_super); @@ -641,7 +637,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); - goto 
error_brelse; + goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -667,12 +663,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } - brelse(bh); + btrfs_release_disk_super(disk_super); return 0; -error_brelse: - brelse(bh); +error_free_page: + btrfs_release_disk_super(disk_super); blkdev_put(bdev, flags); return -EINVAL; @@ -1247,8 +1243,10 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct btrfs_super_block *super) { + struct page *page = virt_to_page(super); + put_page(page); } @@ -1286,7 +1284,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(*page); + btrfs_release_disk_super(p); return 1; } @@ -1349,7 +1347,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, btrfs_free_stale_devices(path, device); } - btrfs_release_disk_super(page); + btrfs_release_disk_super(disk_super); error_bdev_put: blkdev_put(bdev, flags); @@ -1948,10 +1946,10 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } -static void btrfs_scratch_superblocks(struct block_device *bdev, +static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, const char *device_path) { - struct buffer_head *bh; struct btrfs_super_block *disk_super; int copy_num; @@ -1959,15 +1957,26 @@ static void btrfs_scratch_superblocks(struct block_device *bdev, return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + struct page *page; + int ret; + + disk_super = btrfs_read_dev_one_super(bdev, copy_num); + if (IS_ERR(disk_super)) continue; - disk_super = (struct btrfs_super_block *)bh->b_data; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + + page = virt_to_page(disk_super); + set_page_dirty(page); + lock_page(page); + /* write_on_page() unlocks the page */ + ret = write_one_page(page); + if (ret) + btrfs_warn(fs_info, + "error clearing superblock number %d (%d)", + copy_num, ret); + btrfs_release_disk_super(disk_super); + } /* Notify udev that device has changed */ @@ -2095,7 +2104,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * supers and free the device. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) - btrfs_scratch_superblocks(device->bdev, device->name->str); + btrfs_scratch_superblocks(fs_info, device->bdev, + device->name->str); btrfs_close_bdev(device); synchronize_rcu(); @@ -2163,7 +2173,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + btrfs_scratch_superblocks(fs_info, srcdev->bdev, + srcdev->name->str); } btrfs_close_bdev(srcdev); @@ -2222,7 +2233,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) * is already out of device list, so we don't have to hold * the device_list_mutex lock. 
*/ - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, + tgtdev->name->str); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2237,14 +2249,13 @@ static struct btrfs_device *btrfs_find_device_by_path( u64 devid; u8 *dev_uuid; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_device *device; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &bh); + fs_info->bdev_holder, 0, &bdev, &disk_super); if (ret) return ERR_PTR(ret); - disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -2254,7 +2265,7 @@ static struct btrfs_device *btrfs_find_device_by_path( device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, disk_super->fsid, true); - brelse(bh); + btrfs_release_disk_super(disk_super); if (!device) device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 52181897446e..7fa392f38262 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K -struct buffer_head; - struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; @@ -483,7 +481,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); -void btrfs_release_disk_super(struct page *page); +void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) From 314b6dd0eebfa99621202f2be0b5f76e01d849e1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:33 +0900 Subject: [PATCH 088/210] btrfs: use bios instead of buffer_heads from super block writeout Similar to the superblock read path, change the write path to use bios and pages instead of buffer_heads. This allows us to skip over the buffer_head code for writing the superblock to disk. This is based on a patch originally authored by Nikolay Borisov.
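Condensed from the diff below (the helper name write_one_super_mirror is invented here for illustration; checksumming and page setup are omitted), the per-mirror write path becomes:

static void write_one_super_mirror(struct btrfs_device *device,
                                   struct page *page, u64 bytenr, int mirror)
{
        struct bio *bio = bio_alloc(GFP_NOFS, 1);

        bio_set_dev(bio, device->bdev);
        bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
        bio->bi_private = device;
        bio->bi_end_io = btrfs_end_super_write;
        __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
                       offset_in_page(bytenr));

        /* Only the primary super block copy is written with FUA */
        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
        if (mirror == 0)
                bio->bi_opf |= REQ_FUA;

        btrfsic_submit_bio(bio);
}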
Co-developed-by: Nikolay Borisov Signed-off-by: Nikolay Borisov Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 127 ++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ac9554d71a46..756bf2ab64cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -3395,25 +3394,34 @@ fail: } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +static void btrfs_end_super_write(struct bio *bio) { - if (uptodate) { - set_buffer_uptodate(bh); - } else { - struct btrfs_device *device = (struct btrfs_device *) - bh->b_private; + struct btrfs_device *device = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + struct page *page; - btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s", - rcu_str_deref(device->name)); - /* note, we don't set_buffer_write_io_error because we have - * our own ways of dealing with the IO errors - */ - clear_buffer_uptodate(bh); - btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + + if (bio->bi_status) { + btrfs_warn_rl_in_rcu(device->fs_info, + "lost page write due to IO error on %s (%d)", + rcu_str_deref(device->name), + blk_status_to_errno(bio->bi_status)); + ClearPageUptodate(page); + SetPageError(page); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_WRITE_ERRS); + } else { + SetPageUptodate(page); + } + + put_page(page); + unlock_page(page); } - unlock_buffer(bh); - put_bh(bh); + + bio_put(bio); } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, @@ -3473,25 +3481,23 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * buffer heads we write are pinned. + * pages we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when buffer head is not found or submission fails. + * Return number of errors when page is not found or submission fails. 
*/ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; + struct address_space *mapping = device->bdev->bd_inode->i_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct buffer_head *bh; int i; - int ret; int errors = 0; u64 bytenr; - int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3499,6 +3505,10 @@ static int write_dev_supers(struct btrfs_device *device, shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { + struct page *page; + struct bio *bio; + struct btrfs_super_block *disk_super; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) @@ -3511,37 +3521,45 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); crypto_shash_final(shash, sb->csum); - /* One reference for us, and we leave it for the caller */ - bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, + GFP_NOFS); + if (!page) { btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", + "couldn't get super block page for bytenr %llu", bytenr); errors++; continue; } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + /* Bump the refcount for wait_dev_supers() */ + get_page(page); - /* one reference for submit_bh */ - get_bh(bh); - - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; + disk_super = page_address(page); + memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* - * we fua the first super. The others we allow - * to go down lazy. + * Directly use bios here instead of relying on the page cache + * to do I/O, so we don't lose the ability to do integrity + * checking. */ - op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, device->bdev); + bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; + bio->bi_private = device; + bio->bi_end_io = btrfs_end_super_write; + __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, + offset_in_page(bytenr)); + + /* + * We FUA only the first super block. The others we allow to + * go down lazy and there's a short window where the on-disk + * copies might still contain the older version. + */ + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) - op_flags |= REQ_FUA; - ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); - if (ret) - errors++; + bio->bi_opf |= REQ_FUA; + + btrfsic_submit_bio(bio); } return errors < i ? 0 : -1; } @@ -3550,12 +3568,11 @@ static int write_dev_supers(struct btrfs_device *device, * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when buffer head is not found or not marked up to + * Return number of errors when page is not found or not marked up to * date. 
*/ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { - struct buffer_head *bh; int i; int errors = 0; bool primary_failed = false; @@ -3565,32 +3582,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { + struct page *page; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - bh = __find_get_block(device->bdev, - bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_get_page(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + if (!page) { errors++; if (i == 0) primary_failed = true; continue; } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + /* Page is submitted locked and unlocked once the IO completes */ + wait_on_page_locked(page); + if (PageError(page)) { errors++; if (i == 0) primary_failed = true; } - /* drop our reference */ - brelse(bh); + /* Drop our reference */ + put_page(page); - /* drop the reference from the writing run */ - brelse(bh); + /* Drop the reference from the writing run */ + put_page(page); } /* log error, force error return */ From 61ecc5fc18e5037d8719d091e366c4c201d1dcf4 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:34 +0900 Subject: [PATCH 089/210] btrfs: remove btrfsic_submit_bh() Now that the last use of btrfsic_submit_bh() is gone as the super block is now written using bios, remove the function as well. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 57 -------------------------------------- fs/btrfs/check-integrity.h | 2 -- 2 files changed, 59 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a0ce69f2d27c..e7507985435e 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2730,63 +2730,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(op, op_flags, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); - - /* Only called to write the superblock (incl. 
FLUSH/FUA) */ - if (NULL != dev_state && - (op == REQ_OP_WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n", - op, op_flags, (unsigned long long)bh->b_blocknr, - dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - &bh->b_data, 1, NULL, - NULL, bh, op_flags); - } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", - op, op_flags, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = op_flags; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(op, op_flags, bh); -} - static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 9bf4359cc44c..bcc730a06cb5 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); void btrfsic_submit_bio(struct bio *bio); int btrfsic_submit_bio_wait(struct bio *bio); #else -#define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio #define btrfsic_submit_bio_wait submit_bio_wait #endif From 59aaad503fb81b653bcae4ec6be72145e5323e5f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:35 +0900 Subject: [PATCH 090/210] btrfs: remove buffer_heads from btrfsic_process_written_block() Now that the last caller of btrfsic_process_written_block() with buffer_heads is gone, remove the buffer_head processing path from it as well. 
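What gets simplified is check-integrity's interposition on bio completion. Condensed from the hunks below, saving and restoring the submitter's completion state now needs two plain fields instead of a bio/bh union:

/* On submission: remember the caller's state, then interpose */
block->orig_bio_private = bio->bi_private;
block->orig_bio_end_io = bio->bi_end_io;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;

/* In btrfsic_bio_end_io(): restore before chaining to the original */
bp->bi_private = block->orig_bio_private;
bp->bi_end_io = block->orig_bio_end_io;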
Reviewed-by: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 103 +++++++++---------------------------- 1 file changed, 25 insertions(+), 78 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e7507985435e..4f6db2fe482a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -152,11 +152,8 @@ struct btrfsic_block { struct list_head ref_to_list; /* list */ struct list_head ref_from_list; /* list */ struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; + void *orig_bio_private; + bio_end_io_t *orig_bio_end_io; int submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -325,14 +322,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); static void btrfsic_bio_end_io(struct bio *bp); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, int recursion_level); @@ -399,8 +394,8 @@ static void btrfsic_block_init(struct btrfsic_block *b) b->never_written = 0; b->mirror_num = 0; b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; + b->orig_bio_private = NULL; + b->orig_bio_end_io = NULL; INIT_LIST_HEAD(&b->collision_resolving_node); INIT_LIST_HEAD(&b->all_blocks_node); INIT_LIST_HEAD(&b->ref_to_list); @@ -1743,7 +1738,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw) { int is_metadata; @@ -1902,9 +1896,9 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = + block->orig_bio_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; @@ -1916,25 +1910,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io. 
- bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } } @@ -2042,8 +2028,8 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; @@ -2054,24 +2040,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) @@ -2112,8 +2091,8 @@ static void btrfsic_bio_end_io(struct bio *bp) iodone_w_error = 1; BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; + bp->bi_private = block->orig_bio_private; + bp->bi_end_io = block->orig_bio_end_io; do { struct btrfsic_block *next_block; @@ -2146,38 +2125,6 @@ static void btrfsic_bio_end_io(struct bio *bp) bp->bi_end_io(bp); } -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, block->dev_state->name, - block->dev_bytenr, block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - static int 
btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const superblock, @@ -2781,7 +2728,7 @@ static void __btrfsic_submit_bio(struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, bio, &bio_is_patched, - NULL, bio->bi_opf); + bio->bi_opf); bio_for_each_segment(bvec, bio, iter) kunmap(bvec.bv_page); kfree(mapped_datav); @@ -2805,8 +2752,8 @@ static void __btrfsic_submit_bio(struct bio *bio) block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; From 9da2b242e21689514a5aaf27c8f4e7531c671543 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:36 +0900 Subject: [PATCH 091/210] btrfs: remove buffer_heads from super block mirror integrity checking The integrity checking code for the super block mirrors is the last remaining user of buffer_heads; change it to use plain bios as well. Reviewed-by: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 40 ++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 4f6db2fe482a..32e11a23b47f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,7 +77,6 @@ #include #include -#include #include #include #include @@ -762,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *super_tmp; u64 dev_bytenr; - struct buffer_head *bh; struct btrfsic_block *superblock_tmp; int pass; struct block_device *const superblock_bdev = device->bdev; + struct page *page; + struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; + int ret = 0; /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; - bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (NULL == bh) + + page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1))); + + super_tmp = page_address(page); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - brelse(bh); - return 0; + ret = 0; + goto out; } superblock_tmp = @@ -795,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp = btrfsic_block_alloc(); if (NULL == superblock_tmp) { pr_info("btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; + ret = -1; + goto out; } /* for superblock, only the dev_bytenr makes sense */ superblock_tmp->dev_bytenr = dev_bytenr; @@ -880,8 +881,8 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num)) { pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", next_bytenr,
mirror_num); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block = btrfsic_block_lookup_or_add( @@ -890,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num, NULL); if (NULL == next_block) { btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block->disk_key = tmp_disk_key; @@ -902,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror( BTRFSIC_GENERATION_UNKNOWN); btrfsic_release_block_ctx(&tmp_next_block_ctx); if (NULL == l) { - brelse(bh); - return -1; + ret = -1; + goto out; } } } if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) btrfsic_dump_tree_sub(state, superblock_tmp, 0); - brelse(bh); - return 0; +out: + put_page(page); + return ret; } static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) From 7ac8b88ee668a5b4743ebf3e9888fabac85c334a Mon Sep 17 00:00:00 2001 From: ethanwu Date: Fri, 7 Feb 2020 17:38:15 +0800 Subject: [PATCH 092/210] btrfs: backref, only collect file extent items matching backref offset When resolving one backref of type EXTENT_DATA_REF, we collect all references that simply reference the EXTENT_ITEM even though their (file_pos - file_extent_item::offset) are not the same as the btrfs_extent_data_ref::offset we are searching for. This patch adds additional check so that we only collect references whose (file_pos - file_extent_item::offset) == btrfs_extent_data_ref::offset. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: ethanwu Signed-off-by: David Sterba --- fs/btrfs/backref.c | 63 ++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ded46efac27d..f847141c8b41 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info, return -ENOMEM; ref->root_id = root_id; - if (key) { + if (key) ref->key_for_search = *key; - /* - * We can often find data backrefs with an offset that is too - * large (>= LLONG_MAX, maximum allowed file offset) due to - * underflows when subtracting a file's offset with the data - * offset of its corresponding extent data item. This can - * happen for example in the clone ioctl. - * So if we detect such case we set the search key's offset to - * zero to make sure we will find the matching file extent item - * at add_all_parents(), otherwise we will miss it because the - * offset taken form the backref is much larger then the offset - * of the file extent item. This can make us scan a very large - * number of file extent items, but at least it will not make - * us miss any. - * This is an ugly workaround for a behaviour that should have - * never existed, but it does and a fix for the clone ioctl - * would touch a lot of places, cause backwards incompatibility - * and would not fix the problem for extents cloned with older - * kernels. 
- */ - if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && - ref->key_for_search.offset >= LLONG_MAX) - ref->key_for_search.offset = 0; - } else { + else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); - } ref->inode_list = NULL; ref->level = level; @@ -424,6 +401,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, u64 disk_byte; u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; + u64 data_offset; if (level != 0) { eb = path->nodes[level]; @@ -457,11 +435,15 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + data_offset = btrfs_file_extent_offset(eb, fi); if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; - count++; + if (ref->key_for_search.offset == key.offset - data_offset) + count++; + else + goto next; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -513,6 +495,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int root_level; int level = ref->level; int index; + struct btrfs_key search_key = ref->key_for_search; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -545,13 +528,33 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } + /* + * We can often find data backrefs with an offset that is too large + * (>= LLONG_MAX, maximum allowed file offset) due to underflows when + * subtracting a file's offset with the data offset of its + * corresponding extent data item. This can happen for example in the + * clone ioctl. + * + * So if we detect such case we set the search key's offset to zero to + * make sure we will find the matching file extent item at + * add_all_parents(), otherwise we will miss it because the offset + * taken form the backref is much larger then the offset of the file + * extent item. This can make us scan a very large number of file + * extent items, but at least it will not make us miss any. + * + * This is an ugly workaround for a behaviour that should have never + * existed, but it does and a fix for the clone ioctl would touch a lot + * of places, cause backwards incompatibility and would not fix the + * problem for extents cloned with older kernels. + */ + if (search_key.type == BTRFS_EXTENT_DATA_KEY && + search_key.offset >= LLONG_MAX) + search_key.offset = 0; path->lowest_level = level; if (time_seq == SEQ_LAST) - ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, - 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, - time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); From ed58f2e66e849c34826083e5a6c1b506ee8a4d8e Mon Sep 17 00:00:00 2001 From: ethanwu Date: Fri, 7 Feb 2020 17:38:16 +0800 Subject: [PATCH 093/210] btrfs: backref, don't add refs from shared block when resolving normal backref All references from the block of SHARED_DATA_REF belong to that shared block backref. 
For example: item 11 key (40831553536 EXTENT_ITEM 4194304) itemoff 15460 itemsize 95 extent refs 24 gen 7302 flags DATA extent data backref root 257 objectid 260 offset 65536 count 5 extent data backref root 258 objectid 265 offset 0 count 9 shared data backref parent 394985472 count 10 Block 394985472 might be a leaf from root 257, and the item objectid and (file_pos - file_extent_item::offset) in that leaf just happen to be 260 and 65536, which is equal to the first extent data backref entry. Before this patch, when we resolve backref: root 257 objectid 260 offset 65536 we will add those refs in block 394985472 and wrongly treat those as the refs we want. Fix this by checking if the leaf we are processing is a shared data backref; if so, just skip this leaf. Shared data refs added into preftrees.direct have all entry values = 0 (root_id = 0, key = NULL, level = 0) except the parent entry. Other refs from the indirect tree will have key value and root id != 0, and these values won't be changed when their parent is resolved and added to preftrees.direct. Therefore, we can reuse preftrees.direct and search for a ref with all values = 0 except parent, to avoid picking up the blocks of those resolved refs. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: ethanwu Signed-off-by: David Sterba --- fs/btrfs/backref.c | 61 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f847141c8b41..04e9d2efa958 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -386,8 +386,34 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, wanted_disk_byte, count, sc, gfp_mask); } +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ + struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct prelim_ref *ref = NULL; + struct prelim_ref target = {0}; + int result; + + target.parent = bytenr; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, &target); + + if (result < 0) + p = &(*p)->rb_left; + else if (result > 0) + p = &(*p)->rb_right; + else + return 1; + } + return 0; +} + static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, struct prelim_ref *ref, + struct ulist *parents, + struct preftrees *preftrees, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, u64 total_refs, bool ignore_offset) { @@ -412,11 +438,16 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } /* - * We normally enter this function with the path already pointing to - * the first item to check. But sometimes, we may enter it with - * slot==nritems. In that case, go to the next leaf before we continue. + * 1. We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot == nritems. + * 2. We are searching for normal backref but bytenr of this leaf + * matches shared data backref + * For these cases, go to the next leaf before we continue.
*/ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb) || + is_shared_data_backref(preftrees, eb->start)) { if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else @@ -433,6 +464,17 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, key.type != BTRFS_EXTENT_DATA_KEY) break; + /* + * We are searching for normal backref but bytenr of this leaf + * matches shared data backref. + */ + if (slot == 0 && is_shared_data_backref(preftrees, eb->start)) { + if (time_seq == SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + continue; + } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); data_offset = btrfs_file_extent_offset(eb, fi); @@ -484,6 +526,7 @@ next: */ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, + struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, const u64 *extent_item_pos, u64 total_refs, bool ignore_offset) @@ -577,8 +620,8 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs, ignore_offset); + ret = add_all_parents(root, path, parents, preftrees, ref, level, + time_seq, extent_item_pos, total_refs, ignore_offset); out: btrfs_put_root(root); out_free: @@ -658,8 +701,8 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos, + err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, + ref, parents, extent_item_pos, total_refs, ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error From cfc0eed0ec89db7c4a8d461174cabfaa4a0912c7 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Fri, 7 Feb 2020 17:38:17 +0800 Subject: [PATCH 094/210] btrfs: backref, only search backref entries from leaves of the same root We can have some nodes/leaves in a subvolume whose owner is not that subvolume. In this way, when we resolve normal backrefs of that subvolume, we should avoid collecting those references from these blocks. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: ethanwu Signed-off-by: David Sterba --- fs/btrfs/backref.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04e9d2efa958..19a3ae79a0a8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -443,11 +443,14 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot == nritems. * 2. We are searching for normal backref but bytenr of this leaf * matches shared data backref + * 3. The leaf owner is not equal to the root we are searching + * * For these cases, go to the next leaf before we continue. */ eb = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(eb) || - is_shared_data_backref(preftrees, eb->start)) { + is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb)) { if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else @@ -466,9 +469,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, key.type != BTRFS_EXTENT_DATA_KEY) break; /* * We are searching for normal backref but bytenr of this leaf - * matches shared data backref.
+ matches shared data backref, OR + * the leaf owner is not equal to the root we are searching for */ - if (slot == 0 && is_shared_data_backref(preftrees, eb->start)) { + if (slot == 0 && + (is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb))) { if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else From b25b0b871f206936d5bca02b80d38c05623e27da Mon Sep 17 00:00:00 2001 From: ethanwu Date: Fri, 7 Feb 2020 17:38:18 +0800 Subject: [PATCH 095/210] btrfs: backref, use correct count to resolve normal data refs With the following patches: - btrfs: backref, only collect file extent items matching backref offset - btrfs: backref, not adding refs from shared block when resolving normal backref - btrfs: backref, only search backref entries from leaves of the same root we only collect the normal data refs we want, so the imprecise upper bound total_refs of that EXTENT_ITEM can now be changed to the count of the normal backref entry we want to search. Background and how the patches fit together: Btrfs has two types of data backref. For BTRFS_EXTENT_DATA_REF_KEY type of backref, we don't have the exact block number. Therefore, we need to call resolve_indirect_refs. It uses btrfs_search_slot to locate the leaf block. Then we need to walk through the leaves to search for the EXTENT_DATA items that have disk bytenr matching the extent item (add_all_parents). When resolving indirect refs, we could take entries that don't belong to the backref entry we are searching for right now. For that reason, when searching a backref entry, we always use the total refs of that EXTENT_ITEM rather than the individual count. For example: item 11 key (40831553536 EXTENT_ITEM 4194304) itemoff 15460 itemsize 95 extent refs 24 gen 7302 flags DATA shared data backref parent 394985472 count 10 #1 extent data backref root 257 objectid 260 offset 1048576 count 3 #2 extent data backref root 256 objectid 260 offset 65536 count 6 #3 extent data backref root 257 objectid 260 offset 65536 count 5 #4 For example, when searching backref entry #4, we'll use total_refs 24, a very loose loop ending condition, instead of total_refs = 5. But using total_refs = 24 is not accurate. Sometimes, we'll never find all the refs from a specific root. As a result, the loop keeps on going until we reach the end of that inode. The first 3 patches handle 3 different types of refs we might encounter. These refs do not belong to the normal backref we are searching, and hence need to be skipped. This patch changes the total_refs to the correct number so that we can end the loop as soon as we find all the refs we want.
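The heart of the change is the loop bound in add_all_parents(), condensed from the diff below:

/*
 * Before: bounded by the extent item's total reference count (24 in
 * the example), so the leaf walk kept going long after the refs of
 * the backref being resolved had all been found.
 */
while (!ret && count < total_refs) { /* walk file extent items */ }

/*
 * After: bounded by this backref's own count (5 for entry #4), so
 * the walk ends as soon as all of its refs have been collected.
 */
while (!ret && count < ref->count) { /* walk file extent items */ }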
btrfs send uses backref to find possible clone sources, the following is a simple test to compare the results with and without this patch: $ btrfs subvolume create /sub1 $ for i in `seq 1 163840`; do dd if=/dev/zero of=/sub1/file bs=64K count=1 seek=$((i-1)) conv=notrunc oflag=direct done $ btrfs subvolume snapshot /sub1 /sub2 $ for i in `seq 1 163840`; do dd if=/dev/zero of=/sub1/file bs=4K count=1 seek=$(((i-1)*16+10)) conv=notrunc oflag=direct done $ btrfs subvolume snapshot -r /sub1 /snap1 $ time btrfs send /snap1 | btrfs receive /volume2 Without this patch: real 69m48.124s user 0m50.199s sys 70m15.600s With this patch: real 1m59.683s user 0m35.421s sys 2m42.684s Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: ethanwu [ add patchset cover letter with background and numbers ] Signed-off-by: David Sterba --- fs/btrfs/backref.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 19a3ae79a0a8..327e4480957b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -415,7 +415,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs, bool ignore_offset) + bool ignore_offset) { int ret = 0; int slot; @@ -457,7 +457,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, ret = btrfs_next_old_leaf(root, path, time_seq); } - while (!ret && count < total_refs) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -534,8 +534,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs, - bool ignore_offset) + const u64 *extent_item_pos, bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -627,7 +626,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, total_refs, ignore_offset); + time_seq, extent_item_pos, ignore_offset); out: btrfs_put_root(root); out_free: @@ -663,7 +662,7 @@ unode_aux_to_inode_list(struct ulist_node *node) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, u64 total_refs, + const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { int err; @@ -709,7 +708,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, ref, parents, extent_item_pos, - total_refs, ignore_offset); + ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
@@ -812,8 +811,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, */ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, u64 seq, - struct preftrees *preftrees, u64 *total_refs, - struct share_check *sc) + struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; @@ -847,7 +845,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, default: BUG(); } - *total_refs += count; switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ @@ -930,7 +927,7 @@ out: static int add_inline_refs(const struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, - u64 *total_refs, struct share_check *sc) + struct share_check *sc) { int ret = 0; int slot; @@ -954,7 +951,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); - *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -1179,8 +1175,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct prelim_ref *ref; struct rb_node *node; struct extent_inode_elem *eie = NULL; - /* total of both direct AND indirect refs! */ - u64 total_refs = 0; struct preftrees preftrees = { .direct = PREFTREE_INIT, .indirect = PREFTREE_INIT, @@ -1249,7 +1243,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = add_delayed_refs(fs_info, head, time_seq, - &preftrees, &total_refs, sc); + &preftrees, sc); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1270,8 +1264,7 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, - &total_refs, sc); + &info_level, &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(fs_info, path, bytenr, info_level, @@ -1290,7 +1283,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc, ignore_offset); + extent_item_pos, sc, ignore_offset); if (ret) goto out; From c17af96554a8a8777cbb0fd53b8497250e548b43 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 19 Feb 2020 15:17:20 +0100 Subject: [PATCH 096/210] btrfs: raid56: simplify tracking of Q stripe presence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are temporary variables tracking the index of P and Q stripes, but none of them is really used as such, merely for determining if the Q stripe is present. This leads to compiler warnings with -Wunused-but-set-variable and has been reported several times. fs/btrfs/raid56.c: In function ‘finish_rmw’: fs/btrfs/raid56.c:1199:6: warning: variable ‘p_stripe’ set but not used [-Wunused-but-set-variable] 1199 | int p_stripe = -1; | ^~~~~~~~ fs/btrfs/raid56.c: In function ‘finish_parity_scrub’: fs/btrfs/raid56.c:2356:6: warning: variable ‘p_stripe’ set but not used [-Wunused-but-set-variable] 2356 | int p_stripe = -1; | ^~~~~~~~ Replace the two variables with one that has a clear meaning and also get rid of the warnings. The logic that verifies that there are only 2 valid cases is unchanged. 
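For reference, the check that replaces the two index variables can be modeled by a tiny standalone helper (a hypothetical sketch, not the kernel function; raid56 derives the number of parity stripes from real_stripes - nr_data):

#include <assert.h>
#include <stdbool.h>

/* RAID5 carries one parity stripe (P), RAID6 carries two (P and Q);
 * any other difference is a bug, mirroring the BUG() in the patch. */
static bool has_qstripe(int real_stripes, int nr_data)
{
	int nr_parity = real_stripes - nr_data;

	assert(nr_parity == 1 || nr_parity == 2);
	return nr_parity == 2;
}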
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a8e53c8e7b01..406b1efd3ba5 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1196,22 +1196,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct bio_list bio_list; struct bio *bio; int ret; bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } /* at this point we either have a full stripe, * or we've read the full stripe from the drive. @@ -1255,7 +1252,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) SetPageUptodate(p); pointers[stripe++] = kmap(p); - if (q_stripe != -1) { + if (has_qstripe) { /* * raid6, add the qstripe and call the @@ -2353,8 +2350,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct page *p_page = NULL; struct page *q_page = NULL; struct bio_list bio_list; @@ -2364,14 +2360,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { is_replace = 1; @@ -2393,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; SetPageUptodate(p_page); - if (q_stripe != -1) { + if (has_qstripe) { q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!q_page) { __free_page(p_page); @@ -2416,8 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* then add the parity stripe */ pointers[stripe++] = kmap(p_page); - if (q_stripe != -1) { - + if (has_qstripe) { /* * raid6, add the qstripe and call the * library function to fill in our p/q From 560b7a4aa2258e27ad38eb417aabebc1e2c05f5f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 18 Feb 2020 16:56:07 +0200 Subject: [PATCH 097/210] btrfs: call btrfs_check_uuid_tree_entry directly in btrfs_uuid_tree_iterate btrfs_uuid_tree_iterate is called from only one place and its 2nd argument is always btrfs_check_uuid_tree_entry. Simplify btrfs_uuid_tree_iterate's signature by removing its 2nd argument and directly calling btrfs_check_uuid_tree_entry. Also move the latter into uuid-tree.c. No functional changes.
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 +--- fs/btrfs/uuid-tree.c | 53 ++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/volumes.c | 47 +-------------------------------------- 3 files changed, 51 insertions(+), 53 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bb237d577725..ad275d06e95f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2738,9 +2738,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 76b84f2397b1..dc95e954ebbe 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -246,9 +246,53 @@ out: return ret; } -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)) +/* + * Check if there's a matching subvolume for a given UUID + * + * Return: + * 0 check succeeded, the entry is not outdated + * > 0 if the check failed, the caller should remove the entry + * < 0 if an error occurred + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subvolid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subvolid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_get_fs_root(fs_info, &key, true); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + btrfs_put_root(subvol_root); +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; @@ -305,7 +349,8 @@ again_search_slot: read_extent_buffer(leaf, &subid_le, offset, sizeof(subid_le)); subid_cpu = le64_to_cpu(subid_le); - ret = check_func(fs_info, uuid, key.type, subid_cpu); + ret = btrfs_check_uuid_tree_entry(fs_info, uuid, + key.type, subid_cpu); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8a88866aaa3..63e499d47820 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4442,51 +4442,6 @@ out: return 0; } -/* - * Callback for btrfs_uuid_tree_iterate(). - * returns: - * 0 check succeeded, the entry is not outdated. - * < 0 if an error occurred. - * > 0 if the check failed, which means the caller shall remove the entry.
- */ -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subid) -{ - struct btrfs_key key; - int ret = 0; - struct btrfs_root *subvol_root; - - if (type != BTRFS_UUID_KEY_SUBVOL && - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; - - key.objectid = subid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - subvol_root = btrfs_get_fs_root(fs_info, &key, true); - if (IS_ERR(subvol_root)) { - ret = PTR_ERR(subvol_root); - if (ret == -ENOENT) - ret = 1; - goto out; - } - - switch (type) { - case BTRFS_UUID_KEY_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) - ret = 1; - break; - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.received_uuid, - BTRFS_UUID_SIZE)) - ret = 1; - break; - } - btrfs_put_root(subvol_root); -out: - return ret; -} - static int btrfs_uuid_rescan_kthread(void *data) { struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; @@ -4497,7 +4452,7 @@ static int btrfs_uuid_rescan_kthread(void *data) * to delete all entries that contain outdated data. * 2nd step is to add all missing entries to the UUID tree. */ - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); + ret = btrfs_uuid_tree_iterate(fs_info); if (ret < 0) { btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); From 97f4dd09dad05943a9c4184ace47258ed09e8e74 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 18 Feb 2020 16:56:08 +0200 Subject: [PATCH 098/210] btrfs: make btrfs_check_uuid_tree private to disk-io.c It's used only during filesystem mount, so it can be made private to the disk-io.c file. Also use the occasion to move btrfs_uuid_rescan_kthread as btrfs_check_uuid_tree is its sole caller. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 37 +------------------------------------ fs/btrfs/volumes.h | 2 +- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 756bf2ab64cd..074b77041f89 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2835,6 +2835,41 @@ fail: return ret; } +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree.
+ */ + ret = btrfs_uuid_tree_iterate(fs_info); + if (ret < 0) { + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 63e499d47820..cb2f18575ef4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4313,7 +4313,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -static int btrfs_uuid_scan_kthread(void *data) +int btrfs_uuid_scan_kthread(void *data) { struct btrfs_fs_info *fs_info = data; struct btrfs_root *root = fs_info->tree_root; @@ -4442,25 +4442,6 @@ out: return 0; } -static int btrfs_uuid_rescan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; - int ret; - - /* - * 1st step is to iterate through the existing UUID tree and - * to delete all entries that contain outdated data. - * 2nd step is to add all missing entries to the UUID tree. - */ - ret = btrfs_uuid_tree_iterate(fs_info); - if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); - up(&fs_info->uuid_tree_rescan_sem); - return ret; - } - return btrfs_uuid_scan_kthread(data); -} - int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; @@ -4503,22 +4484,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct task_struct *task; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_rescan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7fa392f38262..7b657f2cba57 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -459,7 +459,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); From c94bec2c6190a5d13f59c0f36a660be5e2d85644 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 15:05:01 -0500 Subject: [PATCH 099/210] btrfs: bail out of uuid tree scanning if we're closing In doing my fsstress+EIO stress testing I started running into issues where umount would get stuck forever because the uuid checker was chewing through the thousands of subvolumes I had created. We shouldn't block umount on this; simply bail if we're unmounting the fs. We need to make sure we don't mark the UUID tree as ok, so we only set that bit if we made it through the whole rescan operation, but otherwise this is completely safe. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +++- fs/btrfs/uuid-tree.c | 4 ++++ fs/btrfs/volumes.c | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 074b77041f89..864ffc6c81c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2847,7 +2847,9 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info); if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + if (ret != -EINTR) + btrfs_warn(fs_info, "iterating uuid_tree failed %d", + ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index dc95e954ebbe..76671a6bcb61 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -322,6 +322,10 @@ again_search_slot: } while (1) { + if (btrfs_fs_closing(fs_info)) { + ret = -EINTR; + goto out; + } cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cb2f18575ef4..b9787945fe26 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4325,6 +4325,7 @@ int btrfs_uuid_scan_kthread(void *data) struct btrfs_root_item root_item; u32 item_size; struct btrfs_trans_handle *trans = NULL; + bool closing = false; path = btrfs_alloc_path(); if (!path) { @@ -4337,6 +4338,10 @@ int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { @@ -4436,7 +4441,7 @@ out: btrfs_end_transaction(trans); if (ret) btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else + else if (!closing) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); up(&fs_info->uuid_tree_rescan_sem); return 0; From 75ec1db8717a8f0a9d9c8d033e542fdaa7b73898 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 15:22:06 -0500 Subject: [PATCH 100/210] btrfs: update the uuid generation as soon as possible In my EIO stress testing I noticed I was getting forced to rescan the uuid tree pretty often, which was weird.
This is because my error injection stuff would sometimes inject an error after log replay but before we loaded the UUID tree. If log replay committed the transaction it wouldn't have updated the uuid tree generation, but the tree was valid and didn't change, so there's no reason not to update the generation here. Fix this by setting the BTRFS_FS_UPDATE_UUID_TREE_GEN bit immediately after reading all the fs roots if the uuid tree generation matches the fs generation. Then any transaction commits that happen during mount won't screw up our uuid tree state, forcing us to do needless uuid rescans. Fixes: 70f801754728 ("Btrfs: check UUID tree during mount if required") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 864ffc6c81c1..770d469e1d9c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3151,6 +3151,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_tree_roots; + /* + * If we have a uuid root and we're not being told to rescan we need to + * check the generation here so we can set the + * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the + * transaction during a balance or the log replay without updating the + * uuid generation, and then if we crash we would rescan the uuid tree, + * even though it was perfectly fine. + */ + if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && + fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, @@ -3375,8 +3387,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device close_ctree(fs_info); return ret; } - } else { - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); } set_bit(BTRFS_FS_OPEN, &fs_info->flags); From 5ce48d0f0e2656050a68dc3dbbf0d7d1be862c99 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 23 Feb 2020 23:16:42 +0000 Subject: [PATCH 101/210] btrfs: Add missing lock annotation for release_extent_buffer() Sparse reports a warning at release_extent_buffer(): warning: context imbalance in release_extent_buffer() - unexpected unlock The root cause is the missing annotation at release_extent_buffer(). Add the missing __releases(&eb->refs_lock) annotation. Reviewed-by: Nikolay Borisov Signed-off-by: Jules Irenge Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4f4b41c454f2..14b7007634b7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5276,6 +5276,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } static int release_extent_buffer(struct extent_buffer *eb) + __releases(&eb->refs_lock) { lockdep_assert_held(&eb->refs_lock); From eed0269053cd37ea6ec76ad10cdd2bccc4b807a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 21 Feb 2020 13:16:33 +0100 Subject: [PATCH 102/210] btrfs: define support masks for ioctl volume args v2 The ioctl data for devices or subvolumes can be passed via btrfs_ioctl_vol_args or btrfs_ioctl_vol_args_v2. The latter is more versatile and needs some caution as some of the flags make sense only for some ioctls.
As we're going to extend the flags, define support masks for each ioctl class separately. Reviewed-by: Marcos Paulo de Souza Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 7a8bc8b920f5..49ed71df5e94 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -97,16 +97,26 @@ struct btrfs_ioctl_qgroup_limit_args { }; /* - * flags for subvolumes + * Arguments for specification of subvolumes or devices, supporting by-name or + * by-id and flags * - * Used by: - * struct btrfs_ioctl_vol_args_v2.flags + * The set of supported flags depends on the ioctl * * BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls: * - BTRFS_IOC_SUBVOL_GETFLAGS * - BTRFS_IOC_SUBVOL_SETFLAGS */ +/* Supported flags for BTRFS_IOC_RM_DEV_V2 */ +#define BTRFS_DEVICE_REMOVE_ARGS_MASK \ + (BTRFS_DEVICE_SPEC_BY_ID) + +/* Supported flags for BTRFS_IOC_SNAP_CREATE_V2 and BTRFS_IOC_SUBVOL_CREATE_V2 */ +#define BTRFS_SUBVOL_CREATE_ARGS_MASK \ + (BTRFS_SUBVOL_CREATE_ASYNC | \ + BTRFS_SUBVOL_RDONLY | \ + BTRFS_SUBVOL_QGROUP_INHERIT) + struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; From 673990dba332d99693e90f7ed84954f5d2d6b634 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 21 Feb 2020 13:24:37 +0100 Subject: [PATCH 103/210] btrfs: use ioctl args support mask for subvolume create/delete Using the defined mask instead of flag enumeration in the ioctl handler is preferred. No functional changes. Reviewed-by: Marcos Paulo de Souza Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7b0c79eac3d1..9e173c1ad9c4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1838,9 +1838,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | - BTRFS_SUBVOL_QGROUP_INHERIT)) { + if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; goto free_args; } From 748449cdbe434731aac68c8829158741a6f8f249 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 21 Feb 2020 13:30:14 +0100 Subject: [PATCH 104/210] btrfs: use ioctl args support mask for device delete When the device remove v2 ioctl was added, the full support mask was added to sanity check the flags. However, this would let the subvolume related flags be accepted. This is not supposed to happen. Use the correct support mask, which means that now any of BTRFS_SUBVOL_CREATE_ASYNC, BTRFS_SUBVOL_RDONLY or BTRFS_SUBVOL_QGROUP_INHERIT will be rejected with EOPNOTSUPP. Though this is a user-visible change, specifying subvolume flags for device deletion does not make sense and there are hopefully no applications doing that.
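As a usage illustration, here is a minimal userspace sketch of the v2 device remove ioctl under the new mask (assumed setup: a btrfs filesystem mounted at /mnt, a device with id 2, and CAP_SYS_ADMIN; error handling omitted). Setting any of the subvolume flags in args.flags would now make the call fail with EOPNOTSUPP:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int fd = open("/mnt", O_RDONLY);	/* any path on the mounted fs */

	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_DEVICE_SPEC_BY_ID;	/* the only flag in the mask */
	args.devid = 2;				/* device to remove */
	return ioctl(fd, BTRFS_IOC_RM_DEV_V2, &args);
}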
Reviewed-by: Marcos Paulo de Souza Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9e173c1ad9c4..7a7d5d4753cd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3075,8 +3075,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; } - /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { + if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } From c0c907a47dccf2cf26251a8fb4a8e7a3bf79ce84 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 21 Feb 2020 14:56:12 +0100 Subject: [PATCH 105/210] btrfs: export helpers for subvolume name/id resolution The functions will be used outside of export.c and super.c to allow resolving the subvolume name from a given id, e.g. for the subvolume deletion by id ioctl. Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba [ split from the next patch ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/export.c | 8 ++++---- fs/btrfs/export.h | 5 +++++ fs/btrfs/super.c | 8 ++++---- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad275d06e95f..2ee7d8b4968f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2999,6 +2999,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 657fd6ad6e18..25bd4317bf5a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,9 +57,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -153,7 +153,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = d_inode(child); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 57488ecd7d4e..f32f4113c976 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -18,4 +18,9 @@ struct btrfs_fid { u64 parent_root_objectid; } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); + #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5c16b4bcde9b..4969fcce0704 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1024,8 +1024,8 @@ out: return error; } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *fs_root = NULL; @@ -1442,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, goto out; } } - subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), - subvol_objectid); + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; From 949964c928430a42e0d4f514d888a58a201b6fcb Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 7 Feb 2020 10:05:46 -0300 Subject: [PATCH 106/210] btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl This ioctl will be responsible for deleting a subvolume using its id. This can be used when a system has a file system mounted from a subvolume, rather than the root file system, like below: / @subvol1/ @subvol2/ @subvol_default/ If only @subvol_default is mounted, we have no path to reach @subvol1 and @subvol2, thus no way to delete them. The current subvolume delete ioctl takes a file handle as its argument, and if @subvol_default is mounted, we can't reach @subvol1 and @subvol2 from the same mount point. This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes the extended structure with flags to allow deleting a subvolume using its subvolid. Now, we can use this new ioctl specifying the subvolume id and refer to the same mount point. It doesn't matter which subvolume was mounted, since we can reach the desired one using the subvolume id, and then delete it. The full path to the subvolume id is resolved internally and access is verified as if the subvolume was accessed by path.
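A minimal userspace sketch of the new interface (the subvolume id 257 and the /mnt mount point are illustrative assumptions; error handling omitted):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int fd = open("/mnt", O_RDONLY);	/* any mount point of the fs */

	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_SUBVOL_SPEC_BY_ID;
	args.subvolid = 257;			/* delete by id, no path needed */
	return ioctl(fd, BTRFS_IOC_SNAP_DESTROY_V2, &args);
}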
The volume args v2 structure is extended to use the existing union for subvolume id specification, which is valid when BTRFS_SUBVOL_SPEC_BY_ID is set. Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 136 +++++++++++++++++++++++++++++++------ include/uapi/linux/btrfs.h | 12 +++- 2 files changed, 127 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7a7d5d4753cd..56bd3ea7fb67 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include #include "ctree.h" #include "disk-io.h" +#include "export.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" @@ -2842,7 +2843,8 @@ out: } static noinline int btrfs_ioctl_snap_destroy(struct file *file, - void __user *arg) + void __user *arg, + bool destroy_v2) { struct dentry *parent = file->f_path.dentry; struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); @@ -2851,34 +2853,120 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; - struct btrfs_ioctl_vol_args *vol_args; - int namelen; + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + char *subvol_name, *subvol_name_ptr = NULL; + int subvol_namelen; int err = 0; + bool destroy_parent = false; - if (!S_ISDIR(dir->i_mode)) - return -ENOTDIR; + if (destroy_v2) { + vol_args2 = memdup_user(arg, sizeof(*vol_args2)); + if (IS_ERR(vol_args2)) + return PTR_ERR(vol_args2); - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { + err = -EOPNOTSUPP; + goto out; + } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/') || - strncmp(vol_args->name, "..", namelen) == 0) { - err = -EINVAL; - goto out; + /* + * If SPEC_BY_ID is not set, we are looking for the subvolume by + * name, same as v1 currently does. + */ + if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { + vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + subvol_name = vol_args2->name; + + err = mnt_want_write_file(file); + if (err) + goto out; + } else { + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write_file(file); + if (err) + goto out; + + dentry = btrfs_get_dentry(fs_info->sb, + BTRFS_FIRST_FREE_OBJECTID, + vol_args2->subvolid, 0, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_drop_write; + } + + /* + * Change the default parent since the subvolume being + * deleted can be outside of the current mount point. + */ + parent = btrfs_get_parent(dentry); + + /* + * At this point dentry->d_name can point to '/' if the + * subvolume we want to destroy is outside of the + * current mount point, so we need to release the + * current dentry and execute the lookup to return a new + * one with ->d_name pointing to the + * /subvol_name. + */ + dput(dentry); + if (IS_ERR(parent)) { + err = PTR_ERR(parent); + goto out_drop_write; + } + dir = d_inode(parent); + + /* + * If v2 was used with SPEC_BY_ID, a new parent was + * allocated since the subvolume can be outside of the + * current mount point. Later on we need to release this + * new parent dentry.
+ */ + destroy_parent = true; + + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( + fs_info, vol_args2->subvolid); + if (IS_ERR(subvol_name_ptr)) { + err = PTR_ERR(subvol_name_ptr); + goto free_parent; + } + /* subvol_name_ptr is already NULL terminated */ + subvol_name = (char *)kbasename(subvol_name_ptr); + } + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + subvol_name = vol_args->name; + + err = mnt_want_write_file(file); + if (err) + goto out; } - err = mnt_want_write_file(file); - if (err) - goto out; + subvol_namelen = strlen(subvol_name); + if (strchr(subvol_name, '/') || + strncmp(subvol_name, "..", subvol_namelen) == 0) { + err = -EINVAL; + goto free_subvol_name; + } + + if (!S_ISDIR(dir->i_mode)) { + err = -ENOTDIR; + goto free_subvol_name; + } err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) - goto out_drop_write; - dentry = lookup_one_len(vol_args->name, parent, namelen); + goto free_subvol_name; + dentry = lookup_one_len(subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2947,9 +3035,15 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); +free_subvol_name: + kfree(subvol_name_ptr); +free_parent: + if (destroy_parent) + dput(parent); out_drop_write: mnt_drop_write_file(file); out: + kfree(vol_args2); kfree(vol_args); return err; } @@ -5474,7 +5568,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SUBVOL_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: - return btrfs_ioctl_snap_destroy(file, argp); + return btrfs_ioctl_snap_destroy(file, argp, false); + case BTRFS_IOC_SNAP_DESTROY_V2: + return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: return btrfs_ioctl_subvol_getflags(file, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 49ed71df5e94..b5f3ea36d3cb 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -42,11 +42,14 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) +#define BTRFS_SUBVOL_SPEC_BY_ID (1ULL << 4) + #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ (BTRFS_SUBVOL_CREATE_ASYNC | \ BTRFS_SUBVOL_RDONLY | \ BTRFS_SUBVOL_QGROUP_INHERIT | \ - BTRFS_DEVICE_SPEC_BY_ID) + BTRFS_DEVICE_SPEC_BY_ID | \ + BTRFS_SUBVOL_SPEC_BY_ID) #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 @@ -117,6 +120,10 @@ struct btrfs_ioctl_qgroup_limit_args { BTRFS_SUBVOL_RDONLY | \ BTRFS_SUBVOL_QGROUP_INHERIT) +/* Supported flags for BTRFS_IOC_SNAP_DESTROY_V2 */ +#define BTRFS_SUBVOL_DELETE_ARGS_MASK \ + (BTRFS_SUBVOL_SPEC_BY_ID) + struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; @@ -131,6 +138,7 @@ struct btrfs_ioctl_vol_args_v2 { union { char name[BTRFS_SUBVOL_NAME_MAX + 1]; __u64 devid; + __u64 subvolid; }; }; @@ -959,5 +967,7 @@ enum btrfs_err_code { struct btrfs_ioctl_get_subvol_rootref_args) #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \ struct btrfs_ioctl_ino_lookup_user_args) +#define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \ + struct btrfs_ioctl_vol_args_v2) #endif /* _UAPI_LINUX_BTRFS_H */ From 831fa14f1ef0bfa9740b4e939da3263dc60502e3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Feb 2020 00:10:19 +0900 Subject: [PATCH 107/210] btrfs: use inode from io_ctl in io_ctl_prepare_pages io_ctl_prepare_pages() gets a 'struct
btrfs_io_ctl' as well as a 'struct inode', but btrfs_io_ctl::inode points to the same struct inode as this is assigned in io_ctl_init(). Use the inode from io_ctl to reduce the arguments of io_ctl_prepare_pages. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bd9c4b5da549..7813640c12aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, - int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, int uptodate) { struct page *page; + struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, readahead_cache(inode); - ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + ret = io_ctl_prepare_pages(&io_ctl, 1); if (ret) goto out; @@ -1292,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Lock all pages first so we can lock the extent safely. */ - ret = io_ctl_prepare_pages(io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, 0); if (ret) goto out_unlock; From 7a195f6db90fd8664af9ade0cee4e3876ac8d090 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Feb 2020 00:10:20 +0900 Subject: [PATCH 108/210] btrfs: make the uptodate argument of io_ctl_add_pages() boolean Make the uptodate argument of io_ctl_add_pages() boolean. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7813640c12aa..c65b0e69265c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -371,7 +371,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct page *page; struct inode *inode = io_ctl->inode; @@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, readahead_cache(inode); - ret = io_ctl_prepare_pages(&io_ctl, 1); + ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; @@ -1292,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Lock all pages first so we can lock the extent safely. */ - ret = io_ctl_prepare_pages(io_ctl, 0); + ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; From 1afb648e94542863c6affd3b8c10d8d3011a6dcb Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Feb 2020 00:10:21 +0900 Subject: [PATCH 109/210] btrfs: use standard debug config option to enable free-space-cache debug prints free-space-cache.c has its own set of DEBUG ifdefs which need to be turned on instead of the global CONFIG_BTRFS_DEBUG to print debug messages about failed block-group writes. Switch this over to CONFIG_BTRFS_DEBUG so we always see these messages when running a debug kernel.
Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c65b0e69265c..460a6c1ef8cf 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1191,7 +1191,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; if (block_group) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -1417,7 +1417,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -4037,7 +4037,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), inode->i_size, true); -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", root->root_key.objectid); From fd8efa818c55300ffb69c11e573c5ed18cda541c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Feb 2020 00:10:22 +0900 Subject: [PATCH 110/210] btrfs: simplify error handling in __btrfs_write_out_cache() The error cleanup gotos in __btrfs_write_out_cache() needlessly jump back making the code less readable than needed. Flatten them out so no back-jump is necessary and the read flow is uninterrupted. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 460a6c1ef8cf..3613da065a73 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1367,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return 0; -out: - io_ctl->inode = NULL; - io_ctl_free(io_ctl); - if (ret) { - invalidate_inode_pages2(inode->i_mapping); - BTRFS_I(inode)->generation = 0; - } - btrfs_update_inode(trans, root, inode); - if (must_iput) - iput(inode); - return ret; - out_nospc_locked: cleanup_bitmap_list(&bitmap_list); spin_unlock(&ctl->tree_lock); @@ -1391,7 +1379,17 @@ out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); -goto out; +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); + return ret; } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, From 2992df73268f78ec9281692b9b44ae92f3933b54 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 30 Jan 2020 14:59:44 +0200 Subject: [PATCH 111/210] btrfs: Implement DREW lock A (D)ouble (R)eader (W)riter (E)xclusion lock is a locking primitive that allows having multiple readers or multiple writers but not multiple readers and writers holding it concurrently. The code is factored out from the existing open-coded locking scheme used to exclude pending snapshots from nocow writers and vice-versa.
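The intended usage pattern can be sketched with the API this patch introduces (a sketch only, with placeholder callers; the real call sites appear in the conversion patch that follows):

/* Snapshot creation side acts as a "reader"; many can run concurrently. */
static void snapshot_side(struct btrfs_drew_lock *lock)
{
	btrfs_drew_read_lock(lock);	/* waits for active nocow writers */
	/* ... create the snapshot ... */
	btrfs_drew_read_unlock(lock);
}

/* Nocow write side acts as a "writer" and yields to pending snapshots. */
static int nocow_side(struct btrfs_drew_lock *lock)
{
	if (!btrfs_drew_try_write_lock(lock))
		return -EAGAIN;		/* caller falls back to COW */
	/* ... do the nocow write ... */
	btrfs_drew_write_unlock(lock);
	return 0;
}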
The current implementation actually favors readers (that is, snapshot creators) over writers (nocow writers of the filesystem). The API provides lock/unlock/trylock for reads and writes. A formal specification in TLA+, provided by Valentin Schneider, is at https://lore.kernel.org/linux-btrfs/2dcaf81c-f0d3-409e-cb29-733d8b3b4cc9@arm.com/ Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/locking.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/locking.h | 20 ++++++++++ 3 files changed, 114 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2ee7d8b4968f..ab8151247b93 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +#include "locking.h" struct btrfs_trans_handle; struct btrfs_transaction; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index e713900f96b6..fb647d8cf527 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -565,3 +565,96 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) } return eb; } + +/* + * DREW locks + * ========== + * + * DREW stands for double-reader-writer-exclusion lock. It's used in situations + * where you want to provide A-B exclusion but not AA or BB. + * + * Currently the implementation gives more priority to the reader. If a reader and a + * writer both race to acquire their respective sides of the lock the writer + * would yield its lock as soon as it detects a concurrent reader. Additionally + * if there are pending readers no new writers would be allowed to come in and + * acquire the lock. + */ + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +{ + int ret; + + ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); + if (ret) + return ret; + + atomic_set(&lock->readers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); + + return 0; +} + +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +{ + percpu_counter_destroy(&lock->writers); +} + +/* Return true if acquisition is successful, false otherwise */ +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) +{ + if (atomic_read(&lock->readers)) + return false; + + percpu_counter_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ + smp_mb(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; + } + + return true; +} + +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) +{ + while (true) { + if (btrfs_drew_try_write_lock(lock)) + return; + wait_event(lock->pending_writers, !atomic_read(&lock->readers)); + } +} + +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) +{ + percpu_counter_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); +} + +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) +{ + atomic_inc(&lock->readers); + + /* + * Ensure the pending reader count is perceived BEFORE this reader + * goes to sleep in case of active writers. This guarantees new writers + * won't be allowed and that the current reader will be woken up when + * the last active writer finishes its jobs.
+ */ + smp_mb__after_atomic(); + + wait_event(lock->pending_readers, + percpu_counter_sum(&lock->writers) == 0); +} + +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +{ + /* + * atomic_dec_and_test implies a full barrier, so woken up writers + * are guaranteed to see the decrement + */ + if (atomic_dec_and_test(&lock->readers)) + wake_up(&lock->pending_writers); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 21a285883e89..d715846c10b8 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,9 @@ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H +#include +#include +#include #include "extent_io.h" #define BTRFS_WRITE_LOCK 1 @@ -13,6 +16,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +struct btrfs_path; + void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); @@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) BUG(); } +struct btrfs_drew_lock { + atomic_t readers; + struct percpu_counter writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; +}; + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); + #endif From dcc3eb9638c3c927f1597075e851d0a16300a876 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 30 Jan 2020 14:59:45 +0200 Subject: [PATCH 112/210] btrfs: convert snapshot/nocow exclusion to drew lock This patch removes all haphazard code implementing nocow writers exclusion from pending snapshot creation and switches to using the drew lock to ensure this invariant still holds. 'Readers' are snapshot creators from create_snapshot and 'writers' are nocow writers from the buffered write path or btrfs_setsize. This locking scheme allows for multiple snapshots to happen while any nocow writers are blocked, since writes to page cache in the nocow path will make snapshots inconsistent. So for performance reasons we'd like to have the ability to run multiple concurrent snapshots, which also favors readers in this case. And in case there aren't pending snapshots (which will be the majority of the cases) we rely on the percpu writers counter to avoid cacheline contention. The main gain from using the drew lock is it's now a lot easier to reason about the guarantees of the locking scheme and whether there is some silent breakage lurking.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 10 +++------- fs/btrfs/disk-io.c | 45 ++++++++++-------------------------------- fs/btrfs/extent-tree.c | 44 ----------------------------------------- fs/btrfs/file.c | 11 +++++------ fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 10 +++------- 6 files changed, 25 insertions(+), 103 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ab8151247b93..db9e872bcc79 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -957,11 +957,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) return sb->s_fs_info; } -struct btrfs_subvolume_writers { - struct percpu_counter counter; - wait_queue_head_t wait; -}; - /* * The state of btrfs root */ @@ -1133,8 +1128,9 @@ struct btrfs_root { * root_item_lock. */ int dedupe_in_progress; - struct btrfs_subvolume_writers *subv_writers; - atomic_t will_be_snapshotted; + /* For exclusion of snapshot creation and nocow writes */ + struct btrfs_drew_lock snapshot_lock; + atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 770d469e1d9c..06819c41e4f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1104,32 +1104,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) } } -static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) -{ - struct btrfs_subvolume_writers *writers; - int ret; - - writers = kmalloc(sizeof(*writers), GFP_NOFS); - if (!writers) - return ERR_PTR(-ENOMEM); - - ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); - if (ret < 0) { - kfree(writers); - return ERR_PTR(ret); - } - - init_waitqueue_head(&writers->wait); - return writers; -} - -static void -btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) -{ - percpu_counter_destroy(&writers->counter); - kfree(writers); -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1178,7 +1152,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; @@ -1450,7 +1423,7 @@ alloc_fail: static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; - struct btrfs_subvolume_writers *writers; + unsigned int nofs_flag; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1460,12 +1433,15 @@ static int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } - writers = btrfs_alloc_subvolume_writers(); - if (IS_ERR(writers)) { - ret = PTR_ERR(writers); + /* + * We might be called under a transaction (e.g. 
indirect backref + * resolution) which could deadlock if it triggers memory reclaim + */ + nofs_flag = memalloc_nofs_save(); + ret = btrfs_drew_lock_init(&root->snapshot_lock); + memalloc_nofs_restore(nofs_flag); + if (ret) goto fail; - } - root->subv_writers = writers; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { set_bit(BTRFS_ROOT_REF_COWS, &root->state); @@ -3961,8 +3937,7 @@ void btrfs_free_fs_root(struct btrfs_root *root) WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_dev) free_anon_bdev(root->anon_dev); - if (root->subv_writers) - btrfs_free_subvolume_writers(root->subv_writers); + btrfs_drew_lock_destroy(&root->snapshot_lock); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7eef91d6c2b6..9dcd70cc3ca3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5740,47 +5740,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return bg_ret; return dev_ret; } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ - percpu_counter_dec(&root->subv_writers->counter); - cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ - if (atomic_read(&root->will_be_snapshotted)) - return 0; - - percpu_counter_inc(&root->subv_writers->counter); - /* - * Make sure counter is updated before we check for snapshot creation. 
- */ - smp_mb(); - if (atomic_read(&root->will_be_snapshotted)) { - btrfs_end_write_no_snapshotting(root); - return 0; - } - return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshotting(root); - if (ret) - break; - wait_var_event(&root->will_be_snapshotted, - !atomic_read(&root->will_be_snapshotted)); - } -} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fd52ad00b6c8..8a974a82be51 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1553,8 +1553,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_write_no_snapshotting(root); - if (!ret) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); @@ -1569,7 +1568,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1675,7 +1674,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, data_reserved, pos, write_bytes); else - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); break; } @@ -1779,7 +1778,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1808,7 +1807,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 254f5ea17e40..1e138c83cc6e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4726,16 +4726,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. */ - btrfs_wait_for_snapshot_creation(root); + btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } @@ -4743,7 +4743,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_inode_safe_disk_i_size_write(inode, 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 56bd3ea7fb67..6ded5e346821 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -791,11 +791,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * possible. This is to avoid later writeback (running dealloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. 
	 */
-	atomic_inc(&root->will_be_snapshotted);
-	smp_mb__after_atomic();
-	/* wait for no snapshot writes */
-	wait_event(root->subv_writers->wait,
-		   percpu_counter_sum(&root->subv_writers->counter) == 0);
+	btrfs_drew_read_lock(&root->snapshot_lock);

 	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret)
@@ -876,8 +872,8 @@ fail:
 dec_and_free:
 	if (snapshot_force_cow)
 		atomic_dec(&root->snapshot_force_cow);
-	if (atomic_dec_and_test(&root->will_be_snapshotted))
-		wake_up_var(&root->will_be_snapshotted);
+	btrfs_drew_read_unlock(&root->snapshot_lock);
+
 free_pending:
 	kfree(pending_snapshot->root_item);
 	btrfs_free_path(pending_snapshot->path);

From ee787f9550534612a4694cc4ec9f2d397d4b5752 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 Mar 2019 02:07:02 +0100
Subject: [PATCH 113/210] btrfs: use struct_size to calculate size of raid
 hash table

The struct_size macro does the same calculation and is safe regarding
overflows. Though we're not expecting them to happen, use the helper for
clarity.

Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 406b1efd3ba5..c870ef70f817 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	struct btrfs_stripe_hash *h;
 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 	int i;
-	int table_size;

 	if (info->stripe_hash_table)
 		return 0;
@@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	 * Try harder to allocate and fallback to vmalloc to lower the chance
 	 * of a failing mount.
 	 */
-	table_size = sizeof(*table) + sizeof(*h) * num_entries;
-	table = kvzalloc(table_size, GFP_KERNEL);
+	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
 	if (!table)
 		return -ENOMEM;

From 75fb2e9e49c1bff94cf6652c40889690575ece7c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 3 Aug 2018 00:36:29 +0200
Subject: [PATCH 114/210] btrfs: move mapping of block for discard to its
 caller

The discard case is handled by a simple call forwarded based on the
operation; it fits better in the caller, btrfs_map_block, which has
until now been a trivial wrapper.
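As a minimal standalone sketch of the resulting shape (simplified,
hypothetical names standing in for the btrfs functions): the public
entry point dispatches the special case, and the internal helper
asserts it can no longer see it.

	#include <assert.h>

	enum map_op { OP_READ, OP_WRITE, OP_DISCARD };

	static int map_block_for_discard(void)
	{
		return 0;	/* discard-specific mapping would go here */
	}

	/* Internal helper: discard is handled before we get here. */
	static int internal_map_block(enum map_op op)
	{
		assert(op != OP_DISCARD);
		return 0;	/* common mapping logic would go here */
	}

	/* Public wrapper: forwards the special case to its own helper. */
	int map_block(enum map_op op)
	{
		if (op == OP_DISCARD)
			return map_block_for_discard();
		return internal_map_block(op);
	}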
Reviewed-by: Johannes Thumshirn
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9787945fe26..c93cafe83280 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5878,10 +5878,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 	struct btrfs_io_geometry geom;

 	ASSERT(bbio_ret);
-
-	if (op == BTRFS_MAP_DISCARD)
-		return __btrfs_map_block_for_discard(fs_info, logical,
-						     length, bbio_ret);
+	ASSERT(op != BTRFS_MAP_DISCARD);

 	ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
 	if (ret < 0)
@@ -6111,6 +6108,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		      u64 logical, u64 *length,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
+	if (op == BTRFS_MAP_DISCARD)
+		return __btrfs_map_block_for_discard(fs_info, logical,
+						     length, bbio_ret);
+
 	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
 				 mirror_num, 0);
 }

From 9a8658e33d8fd45879eebc44178b2a172e76bb47 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 20 Mar 2019 13:15:57 +0100
Subject: [PATCH 115/210] btrfs: open code trivial helper btrfs_header_fsid

The helper btrfs_header_fsid follows the naming convention of other
struct accessors but does something completely different. As the
offsetof calculation is clear in the context of extent buffer
operations we can remove it.

Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h   | 5 -----
 fs/btrfs/disk-io.c | 6 ++++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index db9e872bcc79..f66c4ea7491d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1973,11 +1973,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
 	btrfs_set_header_flags(eb, flags);
 }

-static inline unsigned long btrfs_header_fsid(void)
-{
-	return offsetof(struct btrfs_header, fsid);
-}
-
 static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
 {
 	return offsetof(struct btrfs_header, chunk_tree_uuid);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 06819c41e4f4..2e482657a1b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -541,7 +541,8 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 		return -EUCLEAN;

 	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
-				    btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+				    offsetof(struct btrfs_header, fsid),
+				    BTRFS_FSID_SIZE) == 0);

 	if (csum_tree_block(eb, result))
 		return -EINVAL;
@@ -571,7 +572,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
 	u8 fsid[BTRFS_FSID_SIZE];
 	int ret = 1;

-	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
+	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+			   BTRFS_FSID_SIZE);
 	while (fs_devices) {
 		u8 *metadata_uuid;

From c4ac75419826c7afc997bbe4da07ad6f963da22f Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 20 Mar 2019 13:17:13 +0100
Subject: [PATCH 116/210] btrfs: open code trivial helper
 btrfs_header_chunk_tree_uuid

The helper btrfs_header_chunk_tree_uuid follows the naming convention of
other struct accessors but does something completely different. As the
offsetof calculation is clear in the context of extent buffer operations
we can remove it.
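To make the open-coded form concrete, here is a standalone sketch of
what such an offsetof() computes; the struct below is a simplified
stand-in, not the real on-disk btrfs_header layout.

	#include <stddef.h>
	#include <stdio.h>

	/* Simplified stand-in for an on-disk header. */
	struct header {
		unsigned char csum[32];
		unsigned char fsid[16];
		unsigned char chunk_tree_uuid[16];
	};

	int main(void)
	{
		/*
		 * The removed helpers returned nothing more than these
		 * constant byte offsets; callers now state them directly.
		 */
		printf("fsid at %zu, chunk_tree_uuid at %zu\n",
		       offsetof(struct header, fsid),
		       offsetof(struct header, chunk_tree_uuid));
		return 0;
	}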
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ----- fs/btrfs/disk-io.c | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f66c4ea7491d..1cde3f1d8f20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1973,11 +1973,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) -{ - return offsetof(struct btrfs_header, chunk_tree_uuid); -} - static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2e482657a1b4..abd7a613c8e6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3106,7 +3106,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { From 42c9d0b524cf9af180dcb788a938cdc4c678e8cb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 20 Mar 2019 11:54:13 +0100 Subject: [PATCH 117/210] btrfs: simplify parameters of btrfs_set_disk_extent_flags All callers pass extent buffer start and length so the extent buffer itself should work fine. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 +--- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 7 +++---- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b62721ac5ee8..f948435e87df 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -925,9 +925,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, - buf->start, - buf->len, + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags, level, 0); if (ret) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1cde3f1d8f20..ea5d0675465a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2482,7 +2482,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9dcd70cc3ca3..161274118853 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2231,7 +2231,7 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; @@ -2247,7 +2247,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? 
true : false;
 	extent_op->level = level;

-	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
 	if (ret)
 		btrfs_free_delayed_extent_op(extent_op);
 	return ret;
@@ -4741,8 +4741,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_dec_ref(trans, root, eb, 0);
 		BUG_ON(ret); /* -ENOMEM */
-		ret = btrfs_set_disk_extent_flags(trans, eb->start,
-						  eb->len, flag,
+		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
 						  btrfs_header_level(eb), 0);
 		BUG_ON(ret); /* -ENOMEM */
 		wc->flags[level] |= flag;

From 7e8f19e50e3ccf0608133cd76e845c282cd84a55 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 27 Nov 2019 16:02:05 +0100
Subject: [PATCH 118/210] btrfs: adjust message level for unrecognized mount
 option

An unrecognized option is a failure that should get user/administrator
attention; the info level is often below what gets logged, so make it an
error.

Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4969fcce0704..7932d8d07cff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			break;
 #endif
 		case Opt_err:
-			btrfs_info(info, "unrecognized mount option '%s'", p);
+			btrfs_err(info, "unrecognized mount option '%s'", p);
 			ret = -EINVAL;
 			goto out;
 		default:

From eeb6f17200e20fbdc0ab472def6cf41a2a10e4eb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 28 Nov 2019 15:31:17 +0100
Subject: [PATCH 119/210] btrfs: raid56: simplify sort_parity_stripes

Remove the trivial comparator and the open coded swap of two values.

Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c93cafe83280..2d8ffefeef34 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5353,31 +5353,19 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return preferred_mirror;
 }

-static inline int parity_smaller(u64 a, u64 b)
-{
-	return a > b;
-}
-
 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
 {
-	struct btrfs_bio_stripe s;
 	int i;
-	u64 l;
 	int again = 1;

 	while (again) {
 		again = 0;
 		for (i = 0; i < num_stripes - 1; i++) {
-			if (parity_smaller(bbio->raid_map[i],
-					   bbio->raid_map[i+1])) {
-				s = bbio->stripes[i];
-				l = bbio->raid_map[i];
-				bbio->stripes[i] = bbio->stripes[i+1];
-				bbio->raid_map[i] = bbio->raid_map[i+1];
-				bbio->stripes[i+1] = s;
-				bbio->raid_map[i+1] = l;
-
+			/* Swap if parity is on a smaller index */
+			if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
+				swap(bbio->stripes[i], bbio->stripes[i + 1]);
+				swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
 				again = 1;
 			}
 		}

From 1db45a35f071f2818683cf2c6c99ec28b6ce3f2d Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 28 Nov 2019 15:31:46 +0100
Subject: [PATCH 120/210] btrfs: replace u_long type cast with unsigned long

We don't use the u_XX types anywhere, though they're defined.
Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2d8ffefeef34..3d35466f34b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6194,8 +6194,8 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
 		btrfs_debug_in_rcu(fs_info,
 	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
 			bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
-			(u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
-			bio->bi_iter.bi_size);
+			(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
+			dev->devid, bio->bi_iter.bi_size);
 	bio_set_dev(bio, dev->bdev);

 	btrfs_bio_counter_inc_noblocked(fs_info);

From b79ce3dddd3f1be7515452adfed633d13e45d806 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 28 Nov 2019 15:34:28 +0100
Subject: [PATCH 121/210] btrfs: adjust delayed refs message level

The message seems to be for debugging and has little value for users.

Reviewed-by: Johannes Thumshirn
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abd7a613c8e6..f109607ff40b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4302,7 +4302,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	spin_lock(&delayed_refs->lock);
 	if (atomic_read(&delayed_refs->num_entries) == 0) {
 		spin_unlock(&delayed_refs->lock);
-		btrfs_info(fs_info, "delayed_refs has NO entry");
+		btrfs_debug(fs_info, "delayed_refs has NO entry");
 		return ret;
 	}

From 15b6e8a83e910c71e98c368e51b581135539efdf Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 28 Nov 2019 16:15:04 +0100
Subject: [PATCH 122/210] btrfs: reduce pointer indirections in
 btree_readpage_end_io_hook

All we need to read is the checksum size from the fs_info superblock,
and fs_info is provided by the extent buffer, so we can get rid of the
wild pointer indirections from page/inode/root.

Reviewed-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f109607ff40b..a4493e5dac22 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -604,9 +604,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	u64 found_start;
 	int found_level;
 	struct extent_buffer *eb;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	struct btrfs_fs_info *fs_info;
+	u16 csum_size;
 	int ret = 0;
 	u8 result[BTRFS_CSUM_SIZE];
 	int reads_done;
@@ -615,6 +614,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 		goto out;

 	eb = (struct extent_buffer *)page->private;
+	fs_info = eb->fs_info;
+	csum_size = btrfs_super_csum_size(fs_info->super_copy);

 	/* the pending IO might have been the only thing that kept this buffer
 	 * in memory.  Make sure we have a ref for all this other checks

From 56e9f6ea32da884a2ec8ce7ed99171b8c5385d90 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 28 Nov 2019 16:03:00 +0100
Subject: [PATCH 123/210] btrfs: merge unlocking to common exit block in
 btrfs_commit_transaction

The tree_log_mutex and reloc_mutex locks are properly nested, so we can
simplify error handling and add labels for them. This reduces the line
count of the function.
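The pattern, as a minimal standalone sketch (pthread mutexes and
hypothetical step functions stand in for the btrfs locks and commit
phases): each failure jumps to the label that unwinds exactly the locks
taken so far, so every unlock is written once.

	#include <pthread.h>

	static pthread_mutex_t reloc_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t tree_log_mutex = PTHREAD_MUTEX_INITIALIZER;

	static int step_one(void) { return 0; }
	static int step_two(void) { return 0; }

	int commit(void)
	{
		int ret;

		pthread_mutex_lock(&reloc_mutex);
		ret = step_one();
		if (ret)
			goto unlock_reloc;

		pthread_mutex_lock(&tree_log_mutex);
		ret = step_two();
		if (ret)
			goto unlock_tree_log;

		/* Success path drops both locks on its own. */
		pthread_mutex_unlock(&tree_log_mutex);
		pthread_mutex_unlock(&reloc_mutex);
		return 0;

	unlock_tree_log:
		pthread_mutex_unlock(&tree_log_mutex);
	unlock_reloc:
		pthread_mutex_unlock(&reloc_mutex);
		return ret;
	}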
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 58 +++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fdfdfc426539..702e0f2b8307 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2194,10 +2194,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * We insert the dir indexes of the snapshots and update the inode @@ -2210,16 +2208,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * make sure none of the code above managed to slip in a @@ -2245,11 +2239,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_lock(&fs_info->tree_log_mutex); ret = commit_fs_roots(trans); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since the transaction is done, we can apply the pending changes @@ -2267,29 +2258,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * new delayed refs. Must handle them or qgroup can be wrong. */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. 
	 */
 	ret = btrfs_qgroup_account_extents(trans);
-	if (ret < 0) {
-		mutex_unlock(&fs_info->tree_log_mutex);
-		mutex_unlock(&fs_info->reloc_mutex);
-		goto scrub_continue;
-	}
+	if (ret < 0)
+		goto unlock_tree_log;

 	ret = commit_cowonly_roots(trans);
-	if (ret) {
-		mutex_unlock(&fs_info->tree_log_mutex);
-		mutex_unlock(&fs_info->reloc_mutex);
-		goto scrub_continue;
-	}
+	if (ret)
+		goto unlock_tree_log;

 	/*
 	 * The tasks which save the space cache and inode cache may also
@@ -2297,9 +2279,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (TRANS_ABORTED(cur_trans)) {
 		ret = cur_trans->aborted;
-		mutex_unlock(&fs_info->tree_log_mutex);
-		mutex_unlock(&fs_info->reloc_mutex);
-		goto scrub_continue;
+		goto unlock_tree_log;
 	}

 	btrfs_prepare_extent_commit(fs_info);
@@ -2346,6 +2326,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (ret) {
 		btrfs_handle_fs_error(fs_info, ret,
 				      "Error while writing out transaction");
+		/*
+		 * reloc_mutex has been unlocked, tree_log_mutex is still held;
+		 * we can't jump to unlock_tree_log as that would cause a
+		 * double unlock
+		 */
 		mutex_unlock(&fs_info->tree_log_mutex);
 		goto scrub_continue;
 	}
@@ -2394,6 +2378,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)

 	return ret;

+unlock_tree_log:
+	mutex_unlock(&fs_info->tree_log_mutex);
+unlock_reloc:
+	mutex_unlock(&fs_info->reloc_mutex);
 scrub_continue:
 	btrfs_scrub_continue(fs_info);
 cleanup_transaction:

From b82582d668b581c6b3c488278e18def2e98bb2ab Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 4 Feb 2020 19:05:58 +0800
Subject: [PATCH 124/210] btrfs: slightly simplify global block reserve
 calculations

In btrfs_update_global_block_rsv the lines:

	num_bytes = block_rsv->size - block_rsv->reserved;
	block_rsv->reserved += num_bytes;

imply:

	block_rsv->reserved = block_rsv->size;

Assign block_rsv->size to block_rsv->reserved directly and reorder the
lines so they match the other branch.

Signed-off-by: Anand Jain
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-rsv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index e46dc3688983..2f6439435cc3 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -389,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)

 	if (block_rsv->reserved < block_rsv->size) {
 		num_bytes = block_rsv->size - block_rsv->reserved;
-		block_rsv->reserved += num_bytes;
 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
 						      num_bytes);
+		block_rsv->reserved = block_rsv->size;
 	} else if (block_rsv->reserved > block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,

From faf8f7b957134a68f8ca409ae9957affd098d5c1 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza
Date: Tue, 11 Feb 2020 10:55:26 -0300
Subject: [PATCH 125/210] btrfs: ioctl: resize: only show message if size is
 changed

There is no point in informing the user about a size change if there is
none. Update the message to conform to a commonly used format where the
path and devid are printed, and also print the old and new sizes.
Reviewed-by: Johannes Thumshirn
Signed-off-by: Marcos Paulo de Souza
Reviewed-by: David Sterba
[ enhance message ]
Signed-off-by: David Sterba
---
 fs/btrfs/ioctl.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6ded5e346821..0bedb5da061d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1710,9 +1710,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,

 	new_size = round_down(new_size, fs_info->sectorsize);

-	btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
-			  rcu_str_deref(device->name), new_size);
-
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
@@ -1725,6 +1722,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		ret = btrfs_shrink_device(device, new_size);
 	} /* equal, nothing need to do */

+	if (ret == 0 && new_size != old_size)
+		btrfs_info_in_rcu(fs_info,
+			"resize device %s (devid %llu) from %llu to %llu",
+			rcu_str_deref(device->name), device->devid,
+			old_size, new_size);
 out_free:
 	kfree(vol_args);
 out:

From b3ff8f1d380e65dddd772542aa9bff6c86bf715a Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 12 Feb 2020 14:12:44 +0800
Subject: [PATCH 126/210] btrfs: Don't submit any btree write bio if the fs
 has errors

[BUG]
There is a fuzzed image which can cause a KASAN report at unmount time.

  BUG: KASAN: use-after-free in btrfs_queue_work+0x2c1/0x390
  Read of size 8 at addr ffff888067cf6848 by task umount/1922

  CPU: 0 PID: 1922 Comm: umount Tainted: G        W 5.0.21 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
  Call Trace:
   dump_stack+0x5b/0x8b
   print_address_description+0x70/0x280
   kasan_report+0x13a/0x19b
   btrfs_queue_work+0x2c1/0x390
   btrfs_wq_submit_bio+0x1cd/0x240
   btree_submit_bio_hook+0x18c/0x2a0
   submit_one_bio+0x1be/0x320
   flush_write_bio.isra.41+0x2c/0x70
   btree_write_cache_pages+0x3bb/0x7f0
   do_writepages+0x5c/0x130
   __writeback_single_inode+0xa3/0x9a0
   writeback_single_inode+0x23d/0x390
   write_inode_now+0x1b5/0x280
   iput+0x2ef/0x600
   close_ctree+0x341/0x750
   generic_shutdown_super+0x126/0x370
   kill_anon_super+0x31/0x50
   btrfs_kill_super+0x36/0x2b0
   deactivate_locked_super+0x80/0xc0
   deactivate_super+0x13c/0x150
   cleanup_mnt+0x9a/0x130
   task_work_run+0x11a/0x1b0
   exit_to_usermode_loop+0x107/0x130
   do_syscall_64+0x1e5/0x280
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

[CAUSE]
The fuzzed image has a completely screwed up extent tree:

  leaf 29421568 gen 8 total ptrs 6 free space 3587 owner EXTENT_TREE
  refs 2 lock (w:0 r:0 bw:0 br:0 sw:0 sr:0) lock_owner 0 current 5938
	item 0 key (12587008 168 4096) itemoff 3942 itemsize 53
		extent refs 1 gen 9 flags 1
		ref#0: extent data backref root 5 objectid 259 offset 0 count 1
	item 1 key (12591104 168 8192) itemoff 3889 itemsize 53
		extent refs 1 gen 9 flags 1
		ref#0: extent data backref root 5 objectid 271 offset 0 count 1
	item 2 key (12599296 168 4096) itemoff 3836 itemsize 53
		extent refs 1 gen 9 flags 1
		ref#0: extent data backref root 5 objectid 259 offset 4096 count 1
	item 3 key (29360128 169 0) itemoff 3803 itemsize 33
		extent refs 1 gen 9 flags 2
		ref#0: tree block backref root 5
	item 4 key (29368320 169 1) itemoff 3770 itemsize 33
		extent refs 1 gen 9 flags 2
		ref#0: tree block backref root 5
	item 5 key (29372416 169 0) itemoff 3737 itemsize 33
		extent refs 1 gen 9 flags 2
		ref#0: tree block backref root 5

Note that leaf 29421568 doesn't have its backref in the extent tree.
Thus the extent allocator can re-allocate leaf 29421568 for other trees.
In short, the bug is caused by:

- Existing tree block gets allocated to log tree
  This got its generation bumped.

- Log tree balance cleaned dirty bit of offending tree block
  It will not be written back to disk, thus no WRITTEN flag.

- Original owner of the tree block gets COWed
  Since the tree block has higher transid, no WRITTEN flag, it's
  reused, and not traced by transaction::dirty_pages.

- Transaction aborted
  Tree blocks get cleaned according to transaction::dirty_pages. But
  the offending tree block is not recorded at all.

- Filesystem unmount
  All pages are assumed to be clean, all workqueues are destroyed, then
  iput(btree_inode) is called. But the offending tree block is still
  dirty, which triggers writeback, and causes the use-after-free bug.

The detailed sequence looks like this:

- Initial status
  eb: 29421568, header=WRITTEN bflags_dirty=0, page_dirty=0, gen=8,
  not traced by any dirty extent_io_tree.

- New tree block is allocated
  Since there is no backref for 29421568, it's re-allocated as a new
  tree block.
  Keep in mind that tree block 29421568 is still referenced by the
  extent tree.

- Tree block 29421568 is filled for log tree
  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 << (gen bumped)
  traced by btrfs_root::dirty_log_pages

- Some log tree operations
  Since the fs is using node size 4096, the log tree can easily go a
  level higher.

- Log tree needs balance
  Tree block 29421568 gets all its content pushed to the right, thus now
  it is empty, and we don't need it.
  btrfs_clean_tree_block() from __push_leaf_right() gets called.
  eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9
  traced by btrfs_root::dirty_log_pages

- Log tree write back
  btree_write_cache_pages() goes through dirty page ranges, but since
  the page of tree block 29421568 got cleaned already, it's not written
  back to disk. Thus it doesn't have the WRITTEN bit set.
  But ranges in dirty_log_pages are cleared.
  eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9
  not traced by any dirty extent_io_tree.

- Extent tree update when committing transaction
  Since tree block 29421568 has transid equal to the running trans, and
  has no WRITTEN bit, should_cow_block() will use it directly without
  adding it to btrfs_transaction::dirty_pages.
  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9
  not traced by any dirty extent_io_tree.
  At this stage, we're doomed. We have a dirty eb not tracked by any
  extent io tree.

- Transaction gets aborted due to corrupted extent tree
  Btrfs cleans up dirty pages according to transaction::dirty_pages and
  btrfs_root::dirty_log_pages. But since tree block 29421568 is not
  tracked by either of them, it's still dirty.
  eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9
  not traced by any dirty extent_io_tree.

- Filesystem unmount
  Since all cleanup is assumed to be done, all workqueues are destroyed.
  Then iput(btree_inode) is called, expecting no dirty pages.
  But tree block 29421568 is still dirty, thus triggering writeback.
  Since all workqueues are already freed, we cause a use-after-free.

This shows us that log tree blocks plus a bad extent tree can cause wild
dirty pages.

[FIX]
To fix the problem, don't submit any btree write bio if the filesystem
has any error. This is the last safety net, in case the other cleanups
haven't caught it.
Link: https://github.com/bobfuzzer/CVE/tree/master/CVE-2019-19377
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 14b7007634b7..837262d54e28 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3956,6 +3956,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
+	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
 	int ret = 0;
 	int done = 0;
 	int nr_to_write_done = 0;
@@ -4069,7 +4070,39 @@ retry:
 		end_write_bio(&epd, ret);
 		return ret;
 	}
-	ret = flush_write_bio(&epd);
+	/*
+	 * If something went wrong, don't allow any metadata write bio to be
+	 * submitted.
+	 *
+	 * This would prevent use-after-free if we had dirty pages not
+	 * cleaned up, which can still happen with fuzzed images.
+	 *
+	 * - Bad extent tree
+	 *   Allowing existing tree block to be allocated for other trees.
+	 *
+	 * - Log tree operations
+	 *   Existing tree blocks get allocated to log tree, bump their
+	 *   generation, then get cleaned in tree re-balance.
+	 *   Such tree block will not be written back, since it's clean,
+	 *   thus no WRITTEN flag set.
+	 *   And after log writes back, this tree block is not traced by
+	 *   any dirty extent_io_tree.
+	 *
+	 * - Offending tree block gets re-dirtied from its original owner
+	 *   Since it has bumped generation, no WRITTEN flag, it can be
+	 *   reused without COWing. This tree block will not be traced
+	 *   by btrfs_transaction::dirty_pages.
+	 *
+	 * Now such dirty tree block will not be cleaned by any dirty
+	 * extent io tree. Thus we don't want to submit such wild eb
+	 * if the fs already has errors.
+	 */
+	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		ret = flush_write_bio(&epd);
+	} else {
+		ret = -EUCLEAN;
+		end_write_bio(&epd, ret);
+	}
 	return ret;
 }

From d01cd62400b3a42d012069518614d944c35f9c99 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 24 Feb 2020 17:37:49 +0200
Subject: [PATCH 127/210] uuid: Add inline helpers to import / export UUIDs

Sometimes we need to import a UUID from or export it to a raw buffer
that is provided outside of the kernel and can't be declared with a
UUID type. With the current API this operation requires an explicit
cast to one of the UUID types and a length that is always a constant
derived as sizeof of that UUID type.

Provide a set of inline helpers to minimize the developer's effort
when raw buffers are involved.
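A standalone sketch of what the helpers buy a caller; the typedef and
raw buffer below are stand-ins for the kernel types, while the helper
bodies match the hunks that follow.

	#include <string.h>

	typedef struct { unsigned char b[16]; } uuid_t;	/* stand-in */

	static inline void import_uuid(uuid_t *dst, const unsigned char *src)
	{
		memcpy(dst, src, sizeof(uuid_t));
	}

	static inline void export_uuid(unsigned char *dst, const uuid_t *src)
	{
		memcpy(dst, src, sizeof(uuid_t));
	}

	int main(void)
	{
		unsigned char raw[16] = { 0 };	/* e.g. bytes of an on-disk field */
		uuid_t uuid;

		/* Before: memcpy(&uuid, raw, sizeof(uuid_t)) with explicit casts. */
		import_uuid(&uuid, raw);	/* intent and length now implicit */
		export_uuid(raw, &uuid);
		return 0;
	}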
Suggested-by: David Sterba
Acked-by: Christoph Hellwig
Signed-off-by: Andy Shevchenko
Signed-off-by: David Sterba
---
 include/linux/uuid.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 0c631e2a73b6..8e4a5000da03 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -43,6 +43,16 @@ static inline void guid_copy(guid_t *dst, const guid_t *src)
 	memcpy(dst, src, sizeof(guid_t));
 }

+static inline void import_guid(guid_t *dst, const __u8 *src)
+{
+	memcpy(dst, src, sizeof(guid_t));
+}
+
+static inline void export_guid(__u8 *dst, const guid_t *src)
+{
+	memcpy(dst, src, sizeof(guid_t));
+}
+
 static inline bool guid_is_null(const guid_t *guid)
 {
 	return guid_equal(guid, &guid_null);
@@ -58,6 +68,16 @@ static inline void uuid_copy(uuid_t *dst, const uuid_t *src)
 	memcpy(dst, src, sizeof(uuid_t));
 }

+static inline void import_uuid(uuid_t *dst, const __u8 *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
+
+static inline void export_uuid(__u8 *dst, const uuid_t *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
+
 static inline bool uuid_is_null(const uuid_t *uuid)
 {
 	return uuid_equal(uuid, &uuid_null);

From 48a2e88f53aea4dface64883157ad3c428132f75 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 24 Feb 2020 17:37:50 +0200
Subject: [PATCH 128/210] uuid: Provide a GUID generator for raw buffer

In some cases we would like to generate a GUID and export it, though
doing so would require either casting to the internal kernel types or
an intermediate buffer. Instead, achieve this by supplying a pointer to
a raw buffer, making the API complementary to the existing one for
UUIDs.

Reviewed-by: Christoph Hellwig
Signed-off-by: Andy Shevchenko
Signed-off-by: David Sterba
---
 include/linux/uuid.h |  1 +
 lib/uuid.c           | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 8e4a5000da03..3780460a9a85 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -84,6 +84,7 @@ static inline bool uuid_is_null(const uuid_t *uuid)
 }

 void generate_random_uuid(unsigned char uuid[16]);
+void generate_random_guid(unsigned char guid[16]);

 extern void guid_gen(guid_t *u);
 extern void uuid_gen(uuid_t *u);
diff --git a/lib/uuid.c b/lib/uuid.c
index b6a1edb61d87..562d53977cab 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -40,6 +40,16 @@ void generate_random_uuid(unsigned char uuid[16])
 }
 EXPORT_SYMBOL(generate_random_uuid);

+void generate_random_guid(unsigned char guid[16])
+{
+	get_random_bytes(guid, 16);
+	/* Set GUID version to 4 --- truly random generation */
+	guid[7] = (guid[7] & 0x0F) | 0x40;
+	/* Set the GUID variant to DCE */
+	guid[8] = (guid[8] & 0x3F) | 0x80;
+}
+EXPORT_SYMBOL(generate_random_guid);
+
 static void __uuid_gen_common(__u8 b[16])
 {
 	prandom_bytes(b, 16);

From 807fc790aa804102df75b6ac0c943a4d205fc4b5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 24 Feb 2020 17:37:51 +0200
Subject: [PATCH 129/210] btrfs: switch to use new generic UUID API

There are new types and helpers that are supposed to be used in new
code. As a preparation to get rid of the legacy types and API functions,
do the conversion here.
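Condensed from the hunks below, the shape of each conversion site
(root_item->uuid is a raw BTRFS_UUID_SIZE byte array):

	/* Old: generate into a typed value, then copy the raw bytes. */
	uuid_le new_uuid;

	uuid_le_gen(&new_uuid);
	memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);

	/* New: generate directly into the raw buffer. */
	generate_random_guid(root_item->uuid);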
Reviewed-by: Christoph Hellwig Signed-off-by: Andy Shevchenko Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/ioctl.c | 4 +--- fs/btrfs/root-tree.c | 4 +--- fs/btrfs/transaction.c | 7 +++---- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a4493e5dac22..f01f19c76cf5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1223,7 +1223,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; unsigned int nofs_flag; int ret = 0; - uuid_le uuid = NULL_UUID_LE; /* * We're holding a transaction handle, so use a NOFS memory allocation @@ -1262,8 +1261,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) - uuid_le_gen(&uuid); - memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root->root_item.uuid); + else + export_guid(root->root_item.uuid, &guid_null); root->root_item.drop_level = 0; key.objectid = objectid; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0bedb5da061d..04899d3d775e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -574,7 +574,6 @@ static noinline int create_subvol(struct inode *dir, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -644,8 +643,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_generation_v2(root_item, btrfs_root_generation(root_item)); - uuid_le_gen(&new_uuid); - memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root_item->uuid); btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); root_item->ctime = root_item->otime; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b37904327c9e..98b6e0d980f9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,7 +22,6 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item) { - uuid_le uuid; u32 len; int need_reset = 0; @@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, sizeof(*item) - offsetof(struct btrfs_root_item, generation_v2)); - uuid_le_gen(&uuid); - memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(item->uuid); } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 702e0f2b8307..ca617441ecbb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1479,7 +1479,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; - uuid_le new_uuid; ASSERT(pending->path); path = pending->path; @@ -1572,8 +1571,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_generation_v2(new_root_item, trans->transid); - uuid_le_gen(&new_uuid); - memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(new_root_item->uuid); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { @@ -1684,7 +1682,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, + ret = 
btrfs_uuid_tree_add(trans, new_root_item->uuid,
+				  BTRFS_UUID_KEY_SUBVOL,
 				  objectid);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto fail;

From 86eba9d34c41d5c2cffd82febafa34de574da6f7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Mon, 24 Feb 2020 17:37:52 +0200
Subject: [PATCH 130/210] uuid: Remove no more needed macro

uuid_le_gen() is not used anymore, remove it for good.

Reviewed-by: Christoph Hellwig
Signed-off-by: Andy Shevchenko
Signed-off-by: David Sterba
---
 include/linux/uuid.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 3780460a9a85..d41b0d3e9474 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -98,7 +98,6 @@ int guid_parse(const char *uuid, guid_t *u);
 int uuid_parse(const char *uuid, uuid_t *u);

 /* backwards compatibility, don't use in new code */
-#define uuid_le_gen(u)		guid_gen(u)
 #define uuid_le_to_bin(guid, u)	guid_parse(guid, u)

 static inline int uuid_le_cmp(const guid_t u1, const guid_t u2)

From d61acbbf54c612ea9bf67eed609494cda0857b3a Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 7 Feb 2020 13:38:20 +0800
Subject: [PATCH 131/210] btrfs: qgroup: ensure qgroup_rescan_running is only
 set when the worker is at least queued

[BUG]
There are some reports about btrfs waiting forever to unmount itself,
with the following call trace:

  INFO: task umount:4631 blocked for more than 491 seconds.
        Tainted: G           X 5.3.8-2-default #1
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  umount          D    0  4631   3337 0x00000000
  Call Trace:
  ([<00000000174adf7a>] __schedule+0x342/0x748)
   [<00000000174ae3ca>] schedule+0x4a/0xd8
   [<00000000174b1f08>] schedule_timeout+0x218/0x420
   [<00000000174af10c>] wait_for_common+0x104/0x1d8
   [<000003ff804d6994>] btrfs_qgroup_wait_for_completion+0x84/0xb0 [btrfs]
   [<000003ff8044a616>] close_ctree+0x4e/0x380 [btrfs]
   [<0000000016fa3136>] generic_shutdown_super+0x8e/0x158
   [<0000000016fa34d6>] kill_anon_super+0x26/0x40
   [<000003ff8041ba88>] btrfs_kill_super+0x28/0xc8 [btrfs]
   [<0000000016fa39f8>] deactivate_locked_super+0x68/0x98
   [<0000000016fcb198>] cleanup_mnt+0xc0/0x140
   [<0000000016d6a846>] task_work_run+0xc6/0x110
   [<0000000016d04f76>] do_notify_resume+0xae/0xb8
   [<00000000174b30ae>] system_call+0xe2/0x2c8

[CAUSE]
The problem happens when qgroup_rescan_init() has been called but the
worker has not been queued. This is mostly caused by error handling.

  Qgroup ioctl thread                     |            Unmount thread
  ----------------------------------------+-----------------------------------
                                          |
  btrfs_qgroup_rescan()                   |
  |- qgroup_rescan_init()                 |
  |  |- qgroup_rescan_running = true;     |
  |                                       |
  |- trans = btrfs_join_transaction()     |
  |  Some error happened                  |
  |                                       |
  |- btrfs_qgroup_rescan() returns error  |
     But qgroup_rescan_running == true;   |
                                          | close_ctree()
                                          | |- btrfs_qgroup_wait_for_completion()
                                          |    |- running == true;
                                          |    |- wait_for_completion();

btrfs_qgroup_rescan_worker is never queued, thus no one is going to wake
up close_ctree() and we get a deadlock.

All involved qgroup_rescan_init() callers are:

- btrfs_qgroup_rescan()
  The example above. It's possible to trigger the deadlock when an error
  happens.

- btrfs_quota_enable()
  Not possible. Just after qgroup_rescan_init() we queue the work.

- btrfs_read_qgroup_config()
  It's possible to trigger the deadlock. It only initializes the work;
  the work queueing happens in btrfs_qgroup_rescan_resume(). Thus if an
  error happens in between, the deadlock is possible.

We shouldn't set fs_info->qgroup_rescan_running just in
qgroup_rescan_init(), as at that stage we haven't yet queued the qgroup
rescan worker to run.
[FIX]
Set qgroup_rescan_running before queueing the work, so that we ensure
the rescan work is queued when we wait for it.

Fixes: 8d9eddad1946 ("Btrfs: fix qgroup rescan worker initialization")
Signed-off-by: Jeff Mahoney
[ Change subject and cause analysis, use a smaller fix ]
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7e02cd61da73..08c8ab199b6e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1030,6 +1030,7 @@ out_add_root:
 	ret = qgroup_rescan_init(fs_info, 0, 1);
 	if (!ret) {
 		qgroup_rescan_zero_tracking(fs_info);
+		fs_info->qgroup_rescan_running = true;
 		btrfs_queue_work(fs_info->qgroup_rescan_workers,
 				 &fs_info->qgroup_rescan_work);
 	}
@@ -3263,7 +3264,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 		sizeof(fs_info->qgroup_rescan_progress));
 	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
 	init_completion(&fs_info->qgroup_rescan_completion);
-	fs_info->qgroup_rescan_running = true;

 	spin_unlock(&fs_info->qgroup_lock);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3326,8 +3326,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)

 	qgroup_rescan_zero_tracking(fs_info);

+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	fs_info->qgroup_rescan_running = true;
 	btrfs_queue_work(fs_info->qgroup_rescan_workers,
 			 &fs_info->qgroup_rescan_work);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);

 	return 0;
 }
@@ -3363,9 +3366,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,

 void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
 {
-	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		mutex_lock(&fs_info->qgroup_rescan_lock);
+		fs_info->qgroup_rescan_running = true;
 		btrfs_queue_work(fs_info->qgroup_rescan_workers,
 				 &fs_info->qgroup_rescan_work);
+		mutex_unlock(&fs_info->qgroup_rescan_lock);
+	}
 }

 /*

From daf475c9154bb89a7c219e15386eebad8e1142d3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 7 Feb 2020 13:38:21 +0800
Subject: [PATCH 132/210] btrfs: qgroup: Remove the unnecessary spin lock for
 qgroup_rescan_running

After the previous patch, qgroup_rescan_running is protected by
btrfs_fs_info::qgroup_rescan_lock, thus there is no need for the extra
spinlock.
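The resulting invariant, as a minimal standalone sketch with
hypothetical names: every reader and writer of the flag takes the same
mutex, so a nested spinlock adds no additional guarantee.

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t rescan_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool rescan_running;	/* only touched under rescan_lock */

	void set_running(void)
	{
		pthread_mutex_lock(&rescan_lock);
		rescan_running = true;	/* writer holds the mutex */
		pthread_mutex_unlock(&rescan_lock);
	}

	bool read_running(void)
	{
		bool running;

		pthread_mutex_lock(&rescan_lock);
		running = rescan_running;	/* reader holds the same mutex */
		pthread_mutex_unlock(&rescan_lock);
		return running;
	}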
Suggested-by: Josef Bacik
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 08c8ab199b6e..75bc3b686498 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3238,7 +3238,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 	}

 	mutex_lock(&fs_info->qgroup_rescan_lock);
-	spin_lock(&fs_info->qgroup_lock);

 	if (init_flags) {
 		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@@ -3253,7 +3252,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 	}

 	if (ret) {
-		spin_unlock(&fs_info->qgroup_lock);
 		mutex_unlock(&fs_info->qgroup_rescan_lock);
 		return ret;
 	}
@@ -3264,8 +3262,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 		sizeof(fs_info->qgroup_rescan_progress));
 	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
 	init_completion(&fs_info->qgroup_rescan_completion);
-
-	spin_unlock(&fs_info->qgroup_lock);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);

 	btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -3342,9 +3338,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
 	int ret = 0;

 	mutex_lock(&fs_info->qgroup_rescan_lock);
-	spin_lock(&fs_info->qgroup_lock);
 	running = fs_info->qgroup_rescan_running;
-	spin_unlock(&fs_info->qgroup_lock);
 	mutex_unlock(&fs_info->qgroup_rescan_lock);

 	if (!running)

From 52d40aba68dcc42ba3f2846eedf9cb1c6d5ce45f Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:06 +0900
Subject: [PATCH 133/210] btrfs: change full_search to bool in
 find_free_extent_update_loop

The "full_search" variable defined in find_free_extent() is bool, but
the full_search argument of find_free_extent_update_loop() is defined
as int. Trivially fix the argument type.

Reviewed-by: Johannes Thumshirn
Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 161274118853..9d5563a63a1f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3661,7 +3661,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 					struct btrfs_free_cluster *last_ptr,
 					struct btrfs_key *ins,
 					struct find_free_extent_ctl *ffe_ctl,
-					int full_search, bool use_cluster)
+					bool full_search, bool use_cluster)
 {
 	struct btrfs_root *root = fs_info->extent_root;
 	int ret;

From b25c19f49ef294865462bf741e0430c59295ac15 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:07 +0900
Subject: [PATCH 134/210] btrfs: handle invalid profile in chunk allocation

Do not BUG_ON() when an invalid profile is passed to
__btrfs_alloc_chunk(). Instead return -EINVAL with an ASSERT() to catch
bugs during development.
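The general shape of the change, as a standalone sketch with a made-up
validity check: a debug build still stops loudly at the bad call site,
while a production build fails the operation gracefully instead of
crashing the machine.

	#include <assert.h>
	#include <errno.h>

	/* Hypothetical check standing in for alloc_profile_is_valid(). */
	static int profile_is_valid(unsigned long long type)
	{
		return type != 0;
	}

	int alloc_chunk(unsigned long long type)
	{
		if (!profile_is_valid(type)) {
			assert(0);	/* catch it during development */
			return -EINVAL;	/* fail gracefully in production */
		}
		return 0;	/* proceed with allocation */
	}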
Suggested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3d35466f34b0..f1c98b01f6ec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4773,7 +4773,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int j; int index; - BUG_ON(!alloc_profile_is_valid(type, 0)); + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); + return -EINVAL; + } if (list_empty(&fs_devices->alloc_list)) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) From c4a816c67c396b1a06abf77bd249ac6ad02861b3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:08 +0900 Subject: [PATCH 135/210] btrfs: introduce chunk allocation policy Introduce chunk allocation policy for btrfs. This policy controls how chunks and device extents are allocated from devices. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 + fs/btrfs/volumes.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1c98b01f6ec..e4a2d9600b85 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1205,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; out: return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7b657f2cba57..f067b5934c46 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -207,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes); BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); BTRFS_DEVICE_GETSET_FUNCS(bytes_used); +enum btrfs_chunk_allocation_policy { + BTRFS_CHUNK_ALLOC_REGULAR, +}; + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -258,6 +262,8 @@ struct btrfs_fs_devices { struct kobject *devices_kobj; struct kobject *devinfo_kobj; struct completion kobj_unregister; + + enum btrfs_chunk_allocation_policy chunk_alloc_policy; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 From 3b4ffa4088404446edfffd9cd8673c7826e76530 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:09 +0900 Subject: [PATCH 136/210] btrfs: refactor find_free_dev_extent_start() Factor out two functions from find_free_dev_extent_start(). dev_extent_search_start() decides the starting position of the search. dev_extent_hole_check() checks if a hole found is suitable for device extent allocation. These functions also have the switch-cases to change the allocation behavior depending on the policy. 
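As a standalone sketch of how such a policy switch is meant to grow;
the second policy value here is made up for illustration and is not
part of this patch.

	#include <assert.h>

	#define SZ_1M (1024ULL * 1024ULL)

	enum chunk_alloc_policy {
		CHUNK_ALLOC_REGULAR,
		CHUNK_ALLOC_ALIGNED,	/* hypothetical second policy */
	};

	/* Each policy supplies its own search start; new policies add a case. */
	unsigned long long dev_extent_search_start(enum chunk_alloc_policy policy,
						   unsigned long long start)
	{
		switch (policy) {
		case CHUNK_ALLOC_REGULAR:
			/* Skip the superblock and boot loader area. */
			return start > SZ_1M ? start : SZ_1M;
		case CHUNK_ALLOC_ALIGNED:
			/* E.g. round up to a device-imposed boundary. */
			return (start + SZ_1M - 1) & ~(SZ_1M - 1);
		default:
			assert(0);
			return start;
		}
	}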
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 78 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e4a2d9600b85..d126e7d814e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1381,6 +1381,59 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
 	return false;
 }

+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/*
+		 * We don't want to overwrite the superblock on the drive nor
+		 * any area used by the boot loader (grub for example), so we
+		 * make sure to start at an offset of at least 1MB.
+		 */
+		return max_t(u64, start, SZ_1M);
+	default:
+		BUG();
+	}
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device:	the device which we have the hole on
+ * @hole_start: starting position of the hole
+ * @hole_size:	the size of the hole
+ * @num_bytes:	the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns true if the hole position is updated,
+ * false otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+				  u64 *hole_size, u64 num_bytes)
+{
+	bool changed = false;
+	u64 hole_end = *hole_start + *hole_size;
+
+	/*
+	 * Check before we set max_hole_start, otherwise we could end up
+	 * sending back this offset anyway.
+	 */
+	if (contains_pending_extent(device, hole_start, *hole_size)) {
+		if (hole_end >= *hole_start)
+			*hole_size = hole_end - *hole_start;
+		else
+			*hole_size = 0;
+		changed = true;
+	}
+
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/* No extra check */
+		break;
+	default:
+		BUG();
+	}
+
+	return changed;
+}

 /*
  * find_free_dev_extent_start - find free space in the specified device
@@ -1427,12 +1480,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 	int slot;
 	struct extent_buffer *l;

-	/*
-	 * We don't want to overwrite the superblock on the drive nor any area
-	 * used by the boot loader (grub for example), so we make sure to start
-	 * at an offset of at least 1MB.
-	 */
-	search_start = max_t(u64, search_start, SZ_1M);
+	search_start = dev_extent_search_start(device, search_start);

 	path = btrfs_alloc_path();
 	if (!path)
@@ -1490,18 +1538,8 @@ again:

 		if (key.offset > search_start) {
 			hole_size = key.offset - search_start;
-
-			/*
-			 * Have to check before we set max_hole_start, otherwise
-			 * we could end up sending back this offset anyway.
- */ - if (contains_pending_extent(device, &search_start, - hole_size)) { - if (key.offset >= search_start) - hole_size = key.offset - search_start; - else - hole_size = 0; - } + dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1540,8 +1578,8 @@ next: */ if (search_end > search_start) { hole_size = search_end - search_start; - - if (contains_pending_extent(device, &search_start, hole_size)) { + if (dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes)) { btrfs_release_path(path); goto again; } From 4f2bafe8a49e5f5e5cf821117ddd6b868c7dc0e7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:10 +0900 Subject: [PATCH 137/210] btrfs: introduce alloc_chunk_ctl Introduce "struct alloc_chunk_ctl" to wrap needed parameters for the chunk allocation. This will be used to split __btrfs_alloc_chunk() into smaller functions. This commit folds a number of local variables in __btrfs_alloc_chunk() into one "struct alloc_chunk_ctl ctl". There is no functional change. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 158 ++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d126e7d814e9..39a4700ee2ca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4780,6 +4780,36 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID1C34); } +/* + * Structure used internally for __btrfs_alloc_chunk() function. + * Wraps needed parameters. + */ +struct alloc_chunk_ctl { + u64 start; + u64 type; + /* Total number of stripes to allocate */ + int num_stripes; + /* sub_stripes info for map */ + int sub_stripes; + /* Stripes per device */ + int dev_stripes; + /* Maximum number of devices to use */ + int devs_max; + /* Minimum number of devices to use */ + int devs_min; + /* ndevs has to be a multiple of this */ + int devs_increment; + /* Number of copies */ + int ncopies; + /* Number of stripes worth of bytes to store parity information */ + int nparity; + u64 max_stripe_size; + u64 max_chunk_size; + u64 stripe_size; + u64 chunk_size; + int ndevs; +}; + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { @@ -4790,23 +4820,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree; struct extent_map *em; struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; u64 total_avail; - int num_stripes; /* total number of stripes to allocate */ int data_stripes; /* number of stripes that count for block group size */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int nparity; /* number of stripes worth of bytes to - store parity information */ int ret; - u64 max_stripe_size; - u64 max_chunk_size; - u64 stripe_size; - u64 chunk_size; int ndevs; int i; int j; @@ -4823,32 +4841,36 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, return -ENOSPC; } + ctl.start = start; + ctl.type = type; + index = btrfs_bg_flags_to_raid_index(type); - sub_stripes = btrfs_raid_array[index].sub_stripes; - 
dev_stripes = btrfs_raid_array[index].dev_stripes; - devs_max = btrfs_raid_array[index].devs_max; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); - devs_min = btrfs_raid_array[index].devs_min; - devs_increment = btrfs_raid_array[index].devs_increment; - ncopies = btrfs_raid_array[index].ncopies; - nparity = btrfs_raid_array[index].nparity; + ctl.sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl.dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl.devs_max = btrfs_raid_array[index].devs_max; + if (!ctl.devs_max) + ctl.devs_max = BTRFS_MAX_DEVS(info); + ctl.devs_min = btrfs_raid_array[index].devs_min; + ctl.devs_increment = btrfs_raid_array[index].devs_increment; + ctl.ncopies = btrfs_raid_array[index].ncopies; + ctl.nparity = btrfs_raid_array[index].nparity; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = SZ_1G; - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; + ctl.max_stripe_size = SZ_1G; + ctl.max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - max_stripe_size = SZ_1G; + ctl.max_stripe_size = SZ_1G; else - max_stripe_size = SZ_256M; - max_chunk_size = max_stripe_size; + ctl.max_stripe_size = SZ_256M; + ctl.max_chunk_size = ctl.max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = SZ_32M; - max_chunk_size = 2 * max_stripe_size; - devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + ctl.max_stripe_size = SZ_32M; + ctl.max_chunk_size = 2 * ctl.max_stripe_size; + ctl.devs_max = min_t(int, ctl.devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); @@ -4856,8 +4878,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } /* We don't want a chunk larger than 10% of writable space */ - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); + ctl.max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl.max_chunk_size); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); @@ -4893,21 +4915,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); + ret = find_free_dev_extent( + device, ctl.max_stripe_size * ctl.dev_stripes, + &dev_offset, &max_avail); if (ret && ret != -ENOSPC) goto error; if (ret == 0) - max_avail = max_stripe_size * dev_stripes; + max_avail = ctl.max_stripe_size * ctl.dev_stripes; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (max_avail < BTRFS_STRIPE_LEN * ctl.dev_stripes) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: devid %llu has no free space, have=%llu want=%u", __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * dev_stripes); + BTRFS_STRIPE_LEN * ctl.dev_stripes); continue; } @@ -4922,30 +4944,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].dev = device; ++ndevs; } + ctl.ndevs = ndevs; /* * now sort the devices by hole size / available space */ - sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + sort(devices_info, ctl.ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); /* * Round down to number of usable stripes, devs_increment can be any * number so we can't use round_down() */ - ndevs -= ndevs % devs_increment; + ctl.ndevs -= ctl.ndevs % ctl.devs_increment; - if (ndevs < devs_min) { + 
if (ctl.ndevs < ctl.devs_min) { ret = -ENOSPC; if (btrfs_test_opt(info, ENOSPC_DEBUG)) { btrfs_debug(info, "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ndevs, devs_min); + __func__, ctl.ndevs, ctl.devs_min); } goto error; } - ndevs = min(ndevs, devs_max); + ctl.ndevs = min(ctl.ndevs, ctl.devs_max); /* * The primary goal is to maximize the number of stripes, so use as @@ -4954,14 +4977,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. */ - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); - num_stripes = ndevs * dev_stripes; + ctl.stripe_size = div_u64(devices_info[ctl.ndevs - 1].max_avail, + ctl.dev_stripes); + ctl.num_stripes = ctl.ndevs * ctl.dev_stripes; /* * this will have to be fixed for RAID1 and RAID10 over * more drives */ - data_stripes = (num_stripes - nparity) / ncopies; + data_stripes = (ctl.num_stripes - ctl.nparity) / ctl.ncopies; /* * Use the number of data stripes to figure out how big this chunk @@ -4969,44 +4993,46 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * and compare that answer with the max chunk size. If it's higher, * we try to reduce stripe_size. */ - if (stripe_size * data_stripes > max_chunk_size) { + if (ctl.stripe_size * data_stripes > ctl.max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ - stripe_size = min(round_up(div_u64(max_chunk_size, - data_stripes), SZ_16M), - stripe_size); + ctl.stripe_size = + min(round_up(div_u64(ctl.max_chunk_size, data_stripes), + SZ_16M), + ctl.stripe_size); } /* align to BTRFS_STRIPE_LEN */ - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); + ctl.stripe_size = round_down(ctl.stripe_size, BTRFS_STRIPE_LEN); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + map = kmalloc(map_lookup_size(ctl.num_stripes), GFP_NOFS); if (!map) { ret = -ENOMEM; goto error; } - map->num_stripes = num_stripes; - for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; + map->num_stripes = ctl.num_stripes; + + for (i = 0; i < ctl.ndevs; ++i) { + for (j = 0; j < ctl.dev_stripes; ++j) { + int s = i * ctl.dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + - j * stripe_size; + j * ctl.stripe_size; } } map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->sub_stripes = sub_stripes; + map->sub_stripes = ctl.sub_stripes; - chunk_size = stripe_size * data_stripes; + ctl.chunk_size = ctl.stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, chunk_size); + trace_btrfs_chunk_alloc(info, map, start, ctl.chunk_size); em = alloc_extent_map(); if (!em) { @@ -5017,10 +5043,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = chunk_size; + em->len = ctl.chunk_size; em->block_start = 0; em->block_len = em->len; - em->orig_block_len = stripe_size; + em->orig_block_len = ctl.stripe_size; em_tree = &info->mapping_tree; write_lock(&em_tree->lock); @@ -5032,20 +5058,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, 
type, start, chunk_size); + ret = btrfs_make_block_group(trans, 0, type, start, ctl.chunk_size); if (ret) goto error_del_extent; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl.stripe_size); if (list_empty(&dev->post_commit_list)) list_add_tail(&dev->post_commit_list, &trans->transaction->dev_update_list); } - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); + atomic64_sub(ctl.stripe_size * map->num_stripes, + &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); From 27c314d5ca090e046c7f49892fde7708dbd877a2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:11 +0900 Subject: [PATCH 138/210] btrfs: factor out init_alloc_chunk_ctl Factor out init_alloc_chunk_ctl() from __btrfs_alloc_chunk(). This function initialises parameters of "struct alloc_chunk_ctl" for allocation. init_alloc_chunk_ctl() handles a common part of the initialisation to load the RAID parameters from btrfs_raid_array. init_alloc_chunk_ctl_policy_regular() decides some parameters for its allocation. The last "else" case in the original code is moved to __btrfs_alloc_chunk() to handle the error case in the common code. Replace the BUG_ON with ASSERT() and error return at the same time. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 100 ++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 39a4700ee2ca..9ae5f0c1346c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4810,6 +4810,61 @@ struct alloc_chunk_ctl { int ndevs; }; +static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 type = ctl->type; + + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_stripe_size = SZ_1G; + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + /* For larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) + ctl->max_stripe_size = SZ_1G; + else + ctl->max_stripe_size = SZ_256M; + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_stripe_size = SZ_32M; + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } else { + BUG(); + } + + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size); +} + +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + int index = btrfs_bg_flags_to_raid_index(ctl->type); + + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl->devs_max = btrfs_raid_array[index].devs_max; + if (!ctl->devs_max) + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); + ctl->devs_min = btrfs_raid_array[index].devs_min; + ctl->devs_increment = btrfs_raid_array[index].devs_increment; + ctl->ncopies = btrfs_raid_array[index].ncopies; + ctl->nparity = btrfs_raid_array[index].nparity; + ctl->ndevs = 0; + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + 
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
+		break;
+	default:
+		BUG();
+	}
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start,
 			       u64 type)
 {
@@ -4828,7 +4883,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int ndevs;
 	int i;
 	int j;
-	int index;

 	if (!alloc_profile_is_valid(type, 0)) {
 		ASSERT(0);
@@ -4841,45 +4895,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 	}

-	ctl.start = start;
-	ctl.type = type;
-
-	index = btrfs_bg_flags_to_raid_index(type);
-
-	ctl.sub_stripes = btrfs_raid_array[index].sub_stripes;
-	ctl.dev_stripes = btrfs_raid_array[index].dev_stripes;
-	ctl.devs_max = btrfs_raid_array[index].devs_max;
-	if (!ctl.devs_max)
-		ctl.devs_max = BTRFS_MAX_DEVS(info);
-	ctl.devs_min = btrfs_raid_array[index].devs_min;
-	ctl.devs_increment = btrfs_raid_array[index].devs_increment;
-	ctl.ncopies = btrfs_raid_array[index].ncopies;
-	ctl.nparity = btrfs_raid_array[index].nparity;
-
-	if (type & BTRFS_BLOCK_GROUP_DATA) {
-		ctl.max_stripe_size = SZ_1G;
-		ctl.max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
-	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		/* for larger filesystems, use larger metadata chunks */
-		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
-			ctl.max_stripe_size = SZ_1G;
-		else
-			ctl.max_stripe_size = SZ_256M;
-		ctl.max_chunk_size = ctl.max_stripe_size;
-	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ctl.max_stripe_size = SZ_32M;
-		ctl.max_chunk_size = 2 * ctl.max_stripe_size;
-		ctl.devs_max = min_t(int, ctl.devs_max,
-				     BTRFS_MAX_DEVS_SYS_CHUNK);
-	} else {
-		btrfs_err(info, "invalid chunk type 0x%llx requested",
-			  type);
-		BUG();
+	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+		ASSERT(0);
+		return -EINVAL;
 	}

-	/* We don't want a chunk larger than 10% of writable space */
-	ctl.max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
-				 ctl.max_chunk_size);
+	ctl.start = start;
+	ctl.type = type;
+	init_alloc_chunk_ctl(fs_devices, &ctl);

 	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);

From 560156cb25fcb423969543d8bc93fe95eedc6b4c Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:12 +0900
Subject: [PATCH 139/210] btrfs: factor out gather_device_info()

Factor out gather_device_info() from __btrfs_alloc_chunk(). This function
iterates over the devices list and gathers information about the devices.

This commit also introduces "max_avail" and "dev_extent_min" to fold the
same calculation into one variable.

This commit has no functional changes.
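(Editor's illustration, not part of the commit: patches 137-139 follow one
refactoring shape, fold the locals into a context struct, then split the big
function into per-stage helpers that all pass that struct around. A minimal
standalone C sketch of the shape, with every name and number invented:)

#include <stdio.h>

/* Invented stand-in for struct alloc_chunk_ctl: one bag of parameters. */
struct alloc_ctl {
	int dev_stripes;	/* stripes placed on each device */
	int ndevs;		/* usable devices found */
	long long stripe_size;
	long long chunk_size;
};

/* Stage 1: load static parameters (cf. init_alloc_chunk_ctl()). */
static void init_ctl(struct alloc_ctl *ctl)
{
	ctl->dev_stripes = 2;	/* pretend this is a DUP-like profile */
	ctl->ndevs = 0;
}

/* Stage 2: gather per-device info (cf. gather_device_info()). */
static int gather_info(struct alloc_ctl *ctl, const long long *avail, int n)
{
	for (int i = 0; i < n; i++)
		if (avail[i] > 0)
			ctl->ndevs++;
	return ctl->ndevs > 0 ? 0 : -1;
}

/* Stage 3: turn the gathered info into sizes (cf. decide_stripe_size()). */
static void decide_size(struct alloc_ctl *ctl, long long smallest_avail)
{
	ctl->stripe_size = smallest_avail / ctl->dev_stripes;
	ctl->chunk_size = ctl->stripe_size;	/* one data stripe here */
}

int main(void)
{
	const long long avail[] = { 2048, 1024 };
	struct alloc_ctl ctl;

	init_ctl(&ctl);
	if (gather_info(&ctl, avail, 2))
		return 1;
	decide_size(&ctl, 1024);
	printf("ndevs=%d stripe_size=%lld chunk_size=%lld\n",
	       ctl.ndevs, ctl.stripe_size, ctl.chunk_size);
	return 0;
}

(Because every stage reads and writes the same struct, a later patch can
swap out a single stage per policy without re-plumbing a dozen arguments.)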
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 153 ++++++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 70 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ae5f0c1346c..84c12ff94eb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4865,22 +4865,97 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, } } +static int gather_device_info(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + struct btrfs_device *device; + u64 total_avail; + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; + u64 dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; + int ret; + int ndevs = 0; + u64 max_avail; + u64 dev_offset; + + /* + * in the first pass through the devices list, we gather information + * about the available holes on each device. + */ + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + WARN(1, KERN_ERR + "BTRFS: read-only device in alloc_list\n"); + continue; + } + + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + continue; + + if (device->total_bytes > device->bytes_used) + total_avail = device->total_bytes - device->bytes_used; + else + total_avail = 0; + + /* If there is no space on this device, skip it. */ + if (total_avail == 0) + continue; + + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, + &max_avail); + if (ret && ret != -ENOSPC) + return ret; + + if (ret == 0) + max_avail = dev_extent_want; + + if (max_avail < dev_extent_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, + "%s: devid %llu has no free space, have=%llu want=%llu", + __func__, device->devid, max_avail, + dev_extent_min); + continue; + } + + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } + devices_info[ndevs].dev_offset = dev_offset; + devices_info[ndevs].max_avail = max_avail; + devices_info[ndevs].total_avail = total_avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + ctl->ndevs = ndevs; + + /* + * now sort the devices by hole size / available space + */ + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + + return 0; +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device *device; struct map_lookup *map = NULL; struct extent_map_tree *em_tree; struct extent_map *em; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; - u64 total_avail; - int data_stripes; /* number of stripes that count for - block group size */ + /* Number of stripes that count for block group size */ + int data_stripes; int ret; - int ndevs; int i; int j; @@ -4910,71 +4985,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!devices_info) return -ENOMEM; - /* - * in the first pass through the devices list, we gather information - * about the available holes on each device. 
- */ - ndevs = 0; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 max_avail; - u64 dev_offset; - - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - WARN(1, KERN_ERR - "BTRFS: read-only device in alloc_list\n"); - continue; - } - - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - if (device->total_bytes > device->bytes_used) - total_avail = device->total_bytes - device->bytes_used; - else - total_avail = 0; - - /* If there is no space on this device, skip it. */ - if (total_avail == 0) - continue; - - ret = find_free_dev_extent( - device, ctl.max_stripe_size * ctl.dev_stripes, - &dev_offset, &max_avail); - if (ret && ret != -ENOSPC) - goto error; - - if (ret == 0) - max_avail = ctl.max_stripe_size * ctl.dev_stripes; - - if (max_avail < BTRFS_STRIPE_LEN * ctl.dev_stripes) { - if (btrfs_test_opt(info, ENOSPC_DEBUG)) - btrfs_debug(info, - "%s: devid %llu has no free space, have=%llu want=%u", - __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * ctl.dev_stripes); - continue; - } - - if (ndevs == fs_devices->rw_devices) { - WARN(1, "%s: found more than %llu devices\n", - __func__, fs_devices->rw_devices); - break; - } - devices_info[ndevs].dev_offset = dev_offset; - devices_info[ndevs].max_avail = max_avail; - devices_info[ndevs].total_avail = total_avail; - devices_info[ndevs].dev = device; - ++ndevs; - } - ctl.ndevs = ndevs; - - /* - * now sort the devices by hole size / available space - */ - sort(devices_info, ctl.ndevs, sizeof(struct btrfs_device_info), - btrfs_cmp_device_info, NULL); + ret = gather_device_info(fs_devices, &ctl, devices_info); + if (ret < 0) + goto error; /* * Round down to number of usable stripes, devs_increment can be any From 5badf512ecd0530fbb517c999fccacf6a9d87dc1 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:13 +0900 Subject: [PATCH 140/210] btrfs: factor out decide_stripe_size() Factor out decide_stripe_size() from __btrfs_alloc_chunk(). This function calculates the actual stripe size to allocate. decide_stripe_size() handles the common case to round down the 'ndevs' to 'devs_increment' and check the upper and lower limitation of 'ndevs'. decide_stripe_size_regular() decides the size of a stripe and the size of a chunk. The policy is to maximize the number of stripes. This commit has no functional changes. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 136 ++++++++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 84c12ff94eb7..6309a73d100a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4943,6 +4943,82 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, return 0; } +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. 
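 *
 * (Editor's illustration with invented numbers, not part of the patch: for
 * a DATA chunk in the DUP profile, dev_stripes = 2, ncopies = 2 and
 * nparity = 0. With one usable device whose max_avail is 2GiB this gives
 * stripe_size = 2GiB / 2 = 1GiB, num_stripes = 1 * 2 = 2 and
 * data_stripes = (2 - 0) / 2 = 1, i.e. 1GiB of logical space backed by
 * 2GiB on disk. Only if stripe_size * data_stripes exceeded max_chunk_size
 * would the branch below shrink stripe_size.)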
+ */ + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + + /* This will have to be fixed for RAID1 and RAID10 over more drives */ + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* + * Use the number of data stripes to figure out how big this chunk is + * really going to be in terms of logical address space, and compare + * that answer with the max chunk size. If it's higher, we try to + * reduce stripe_size. + */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + /* + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. + */ + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, + data_stripes), SZ_16M), + ctl->stripe_size); + } + + /* Align to BTRFS_STRIPE_LEN */ + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() that requires power of 2, while + * rounddown is safe. + */ + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); + + if (ctl->ndevs < ctl->devs_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ctl->ndevs, ctl->devs_min); + } + return -ENOSPC; + } + + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + return decide_stripe_size_regular(ctl, devices_info); + default: + BUG(); + } +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { @@ -4953,8 +5029,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct extent_map *em; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; - /* Number of stripes that count for block group size */ - int data_stripes; int ret; int i; int j; @@ -4989,61 +5063,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto error; - /* - * Round down to number of usable stripes, devs_increment can be any - * number so we can't use round_down() - */ - ctl.ndevs -= ctl.ndevs % ctl.devs_increment; - - if (ctl.ndevs < ctl.devs_min) { - ret = -ENOSPC; - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { - btrfs_debug(info, - "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ctl.ndevs, ctl.devs_min); - } + ret = decide_stripe_size(fs_devices, &ctl, devices_info); + if (ret < 0) goto error; - } - - ctl.ndevs = min(ctl.ndevs, ctl.devs_max); - - /* - * The primary goal is to maximize the number of stripes, so use as - * many devices as possible, even if the stripes are not maximum sized. - * - * The DUP profile stores more than one stripe per device, the - * max_avail is the total size so we have to adjust. 
- */ - ctl.stripe_size = div_u64(devices_info[ctl.ndevs - 1].max_avail, - ctl.dev_stripes); - ctl.num_stripes = ctl.ndevs * ctl.dev_stripes; - - /* - * this will have to be fixed for RAID1 and RAID10 over - * more drives - */ - data_stripes = (ctl.num_stripes - ctl.nparity) / ctl.ncopies; - - /* - * Use the number of data stripes to figure out how big this chunk - * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size. If it's higher, - * we try to reduce stripe_size. - */ - if (ctl.stripe_size * data_stripes > ctl.max_chunk_size) { - /* - * Reduce stripe_size, round it up to a 16MB boundary again and - * then use it, unless it ends up being even bigger than the - * previous value we had already. - */ - ctl.stripe_size = - min(round_up(div_u64(ctl.max_chunk_size, data_stripes), - SZ_16M), - ctl.stripe_size); - } - - /* align to BTRFS_STRIPE_LEN */ - ctl.stripe_size = round_down(ctl.stripe_size, BTRFS_STRIPE_LEN); map = kmalloc(map_lookup_size(ctl.num_stripes), GFP_NOFS); if (!map) { @@ -5067,8 +5089,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = ctl.sub_stripes; - ctl.chunk_size = ctl.stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, ctl.chunk_size); em = alloc_extent_map(); From dce580ca403a3e99806a39d54bcb0ad631dee01f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:14 +0900 Subject: [PATCH 141/210] btrfs: factor out create_chunk() Factor out create_chunk() from __btrfs_alloc_chunk(). This function finally creates a chunk. There is no functional changes. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 186 ++++++++++++++++++++++++--------------------- 1 file changed, 98 insertions(+), 88 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6309a73d100a..888009bbec8f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5019,19 +5019,108 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } +static int create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; + int ret; + int i; + int j; + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { + for (j = 0; j < ctl->dev_stripes; ++j) { + int s = i * ctl->dev_stripes + j; + map->stripes[s].dev = devices_info[i].dev; + map->stripes[s].physical = devices_info[i].dev_offset + + j * ctl->stripe_size; + } + } + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->type = type; + map->sub_stripes = ctl->sub_stripes; + + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); + + em = alloc_extent_map(); + if (!em) { + kfree(map); + return -ENOMEM; + } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + em->map_lookup = map; + em->start = start; + em->len = ctl->chunk_size; + em->block_start = 0; + em->block_len = em->len; + em->orig_block_len = ctl->stripe_size; + + em_tree = &info->mapping_tree; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + if (ret) { + write_unlock(&em_tree->lock); + 
free_extent_map(em); + return ret; + } + write_unlock(&em_tree->lock); + + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); + if (ret) + goto error_del_extent; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl->stripe_size); + if (list_empty(&dev->post_commit_list)) + list_add_tail(&dev->post_commit_list, + &trans->transaction->dev_update_list); + } + + atomic64_sub(ctl->stripe_size * map->num_stripes, + &info->free_chunk_space); + + free_extent_map(em); + check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); + + return 0; + +error_del_extent: + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); + + return ret; +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; int ret; - int i; - int j; if (!alloc_profile_is_valid(type, 0)) { ASSERT(0); @@ -5061,94 +5150,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = gather_device_info(fs_devices, &ctl, devices_info); if (ret < 0) - goto error; + goto out; ret = decide_stripe_size(fs_devices, &ctl, devices_info); if (ret < 0) - goto error; + goto out; - map = kmalloc(map_lookup_size(ctl.num_stripes), GFP_NOFS); - if (!map) { - ret = -ENOMEM; - goto error; - } + ret = create_chunk(trans, &ctl, devices_info); - map->num_stripes = ctl.num_stripes; - - for (i = 0; i < ctl.ndevs; ++i) { - for (j = 0; j < ctl.dev_stripes; ++j) { - int s = i * ctl.dev_stripes + j; - map->stripes[s].dev = devices_info[i].dev; - map->stripes[s].physical = devices_info[i].dev_offset + - j * ctl.stripe_size; - } - } - map->stripe_len = BTRFS_STRIPE_LEN; - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = ctl.sub_stripes; - - trace_btrfs_chunk_alloc(info, map, start, ctl.chunk_size); - - em = alloc_extent_map(); - if (!em) { - kfree(map); - ret = -ENOMEM; - goto error; - } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = start; - em->len = ctl.chunk_size; - em->block_start = 0; - em->block_len = em->len; - em->orig_block_len = ctl.stripe_size; - - em_tree = &info->mapping_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret) { - write_unlock(&em_tree->lock); - free_extent_map(em); - goto error; - } - write_unlock(&em_tree->lock); - - ret = btrfs_make_block_group(trans, 0, type, start, ctl.chunk_size); - if (ret) - goto error_del_extent; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_device *dev = map->stripes[i].dev; - - btrfs_device_set_bytes_used(dev, - dev->bytes_used + ctl.stripe_size); - if (list_empty(&dev->post_commit_list)) - list_add_tail(&dev->post_commit_list, - &trans->transaction->dev_update_list); - } - - atomic64_sub(ctl.stripe_size * map->num_stripes, - &info->free_chunk_space); - - free_extent_map(em); - check_raid56_incompat_flag(info, type); - check_raid1c34_incompat_flag(info, type); - - kfree(devices_info); - return 0; - -error_del_extent: - write_lock(&em_tree->lock); - 
remove_extent_mapping(em_tree, em);
-	write_unlock(&em_tree->lock);
-
-	/* One for our allocation */
-	free_extent_map(em);
-	/* One for the tree reference */
-	free_extent_map(em);
-error:
+out:
 	kfree(devices_info);
 	return ret;
 }

From 6aafb3038454bf137463f2c2096065ba9a452ccc Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:15 +0900
Subject: [PATCH 142/210] btrfs: parameterize dev_extent_min for chunk allocation

Currently, we ignore a device whose available space is less than
"BTRFS_STRIPE_LEN * dev_stripes". This is a lower limit for the current
allocation policy (to maximize the number of stripes). This commit
parameterizes dev_extent_min, so that other policies can set their own
lower limit to ignore a device with insufficient space.

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 888009bbec8f..1f7ad23dcfb5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4805,6 +4805,7 @@ struct alloc_chunk_ctl {
 	int nparity;
 	u64 max_stripe_size;
 	u64 max_chunk_size;
+	u64 dev_extent_min;
 	u64 stripe_size;
 	u64 chunk_size;
 	int ndevs;
@@ -4838,6 +4839,7 @@ static void init_alloc_chunk_ctl_policy_regular(
 	/* We don't want a chunk larger than 10% of writable space */
 	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 				  ctl->max_chunk_size);
+	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
 }

 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
@@ -4873,7 +4875,6 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 	struct btrfs_device *device;
 	u64 total_avail;
 	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
-	u64 dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
 	int ret;
 	int ndevs = 0;
 	u64 max_avail;
@@ -4901,7 +4902,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 			total_avail = 0;

 		/* If there is no space on this device, skip it. */
-		if (total_avail == 0)
+		if (total_avail < ctl->dev_extent_min)
 			continue;

 		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
@@ -4912,12 +4913,12 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 		if (ret == 0)
 			max_avail = dev_extent_want;

-		if (max_avail < dev_extent_min) {
+		if (max_avail < ctl->dev_extent_min) {
 			if (btrfs_test_opt(info, ENOSPC_DEBUG))
 				btrfs_debug(info,
 	"%s: devid %llu has no free space, have=%llu want=%llu",
 					    __func__, device->devid, max_avail,
-					    dev_extent_min);
+					    ctl->dev_extent_min);
 			continue;
 		}

From cb2f96f8ab68812fa3e6f32ec763fb15ae6395c6 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:16 +0900
Subject: [PATCH 143/210] btrfs: introduce extent allocation policy

This commit introduces an extent allocation policy for btrfs. This policy
controls how btrfs allocates extents from block groups.

There is no functional change introduced with this commit.
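(Editor's illustration, not part of the commit: the policy mechanism here is
plain enum-plus-switch dispatch, the same pattern already used for chunk
allocation above. A minimal standalone C sketch, all names invented:)

#include <stdio.h>

enum alloc_policy {
	ALLOC_CLUSTERED,	/* mirrors BTRFS_EXTENT_ALLOC_CLUSTERED */
};

struct alloc_request {
	enum alloc_policy policy;
	long long num_bytes;
};

static int do_allocation_clustered(const struct alloc_request *req)
{
	printf("clustered allocation of %lld bytes\n", req->num_bytes);
	return 0;
}

/* Every policy hook switches on the policy; an unknown value is a bug. */
static int do_allocation(const struct alloc_request *req)
{
	switch (req->policy) {
	case ALLOC_CLUSTERED:
		return do_allocation_clustered(req);
	default:
		return -1;	/* the kernel code would BUG() here */
	}
}

int main(void)
{
	struct alloc_request req = { ALLOC_CLUSTERED, 4096 };

	return do_allocation(&req);
}

(Adding a policy later means adding one enum value and one case per hook;
the clustered code paths stay untouched.)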
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9d5563a63a1f..bd0d13661ea7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3427,6 +3427,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, +}; + /* * Structure used internally for find_free_extent() function. Wraps needed * parameters. @@ -3478,6 +3482,9 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; }; @@ -3815,6 +3822,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.have_caching_bg = false; ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; From ea544149a49f3aad4b6f164507608e59c12bbc11 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:17 +0900 Subject: [PATCH 144/210] btrfs: move hint_byte into find_free_extent_ctl This commit moves hint_byte into find_free_extent_ctl, so that we can modify the hint_byte in the other functions. This will help us split find_free_extent further. This commit also renames the function argument "hint_byte" to "hint_byte_orig" to avoid misuse. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bd0d13661ea7..4f68291d03ea 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3483,6 +3483,9 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + /* Hint where to start looking for an empty space */ + u64 hint_byte; + /* Allocation policy */ enum btrfs_extent_allocation_policy policy; }; @@ -3797,7 +3800,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, + u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { int ret = 0; @@ -3822,6 +3825,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.have_caching_bg = false; ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; + ffe_ctl.hint_byte = hint_byte_orig; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -3864,14 +3868,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) - hint_byte = last_ptr->window_start; + ffe_ctl.hint_byte = last_ptr->window_start; if (last_ptr->fragmented) { /* * We still set window_start so we can keep track of the * last place we found an allocation to try and save * some time. 
*/ - hint_byte = last_ptr->window_start; + ffe_ctl.hint_byte = last_ptr->window_start; use_cluster = false; } spin_unlock(&last_ptr->lock); @@ -3879,8 +3883,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.search_start = max(ffe_ctl.search_start, first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { + ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); + if (ffe_ctl.search_start == ffe_ctl.hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl.search_start); /* From c10859be9b96815e28637e1e0fbd950802929f4f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:18 +0900 Subject: [PATCH 145/210] btrfs: move variables for clustered allocation into find_free_extent_ctl Move "last_ptr" and "use_cluster" into struct find_free_extent_ctl, so that hook functions for clustered allocator can use these variables. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4f68291d03ea..0d6e2529a75b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3447,6 +3447,8 @@ struct find_free_extent_ctl { /* For clustered allocation */ u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; bool have_caching_bg; bool orig_have_caching_bg; @@ -3805,11 +3807,9 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, { int ret = 0; int cache_block_group_error = 0; - struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - bool use_cluster = true; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); @@ -3818,8 +3818,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; ffe_ctl.delalloc = delalloc; ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); ffe_ctl.have_caching_bg = false; @@ -3828,6 +3826,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.hint_byte = hint_byte_orig; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + /* For clustered allocation */ + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.last_ptr = NULL; + ffe_ctl.use_cluster = true; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -3858,14 +3862,16 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); return -ENOSPC; } else if (space_info->max_extent_size) { - use_cluster = false; + ffe_ctl.use_cluster = false; } spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { + ffe_ctl.last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); + if (ffe_ctl.last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl.last_ptr; + spin_lock(&last_ptr->lock); if (last_ptr->block_group) ffe_ctl.hint_byte = last_ptr->window_start; @@ -3876,7 +3882,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, * some time. 
*/ ffe_ctl.hint_byte = last_ptr->window_start; - use_cluster = false; + ffe_ctl.use_cluster = false; } spin_unlock(&last_ptr->lock); } @@ -3989,10 +3995,11 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr && use_cluster) { + if (ffe_ctl.last_ptr && ffe_ctl.use_cluster) { struct btrfs_block_group *cluster_bg = NULL; - ret = find_free_extent_clustered(block_group, last_ptr, + ret = find_free_extent_clustered(block_group, + ffe_ctl.last_ptr, &ffe_ctl, &cluster_bg); if (ret == 0) { @@ -4010,8 +4017,8 @@ have_block_group: /* ret == -ENOENT case falls through */ } - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); + ret = find_free_extent_unclustered(block_group, + ffe_ctl.last_ptr, &ffe_ctl); if (ret == -EAGAIN) goto have_block_group; else if (ret > 0) @@ -4060,8 +4067,9 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); + ret = find_free_extent_update_loop(fs_info, ffe_ctl.last_ptr, ins, + &ffe_ctl, full_search, + ffe_ctl.use_cluster); if (ret > 0) goto search; From c668690dc035e9a8597a7e60feeb85d1b1e581e6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:19 +0900 Subject: [PATCH 146/210] btrfs: factor out do_allocation() for extent allocation Factor out do_allocation() from find_free_extent(). This function do an actual extent allocation in a given block group. The ffe_ctl->policy is used to determine the actual allocator function to use. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 75 +++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d6e2529a75b..49a682038d1b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3664,6 +3664,37 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, return 0; } +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl->last_ptr, + ffe_ctl, bg_ret); + if (ret >= 0 || ret == -EAGAIN) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl->last_ptr, + ffe_ctl); +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -3931,6 +3962,8 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl.index], list) { + struct btrfs_block_group *bg_ret; + /* If the block group is read-only, we can skip it entirely. 
*/ if (unlikely(block_group->ro)) continue; @@ -3991,40 +4024,20 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (ffe_ctl.last_ptr && ffe_ctl.use_cluster) { - struct btrfs_block_group *cluster_bg = NULL; - - ret = find_free_extent_clustered(block_group, - ffe_ctl.last_ptr, - &ffe_ctl, &cluster_bg); - - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; + bg_ret = NULL; + ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + if (ret == 0) { + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, delalloc); + block_group = bg_ret; } - /* ret == -ENOENT case falls through */ + } else if (ret == -EAGAIN) { + goto have_block_group; + } else if (ret > 0) { + goto loop; } - ret = find_free_extent_unclustered(block_group, - ffe_ctl.last_ptr, &ffe_ctl); - if (ret == -EAGAIN) - goto have_block_group; - else if (ret > 0) - goto loop; - /* ret == 0 case falls through */ -checks: + /* Checks */ ffe_ctl.search_start = round_up(ffe_ctl.found_offset, fs_info->stripesize); From 897cae7948cb06c2f98de3c6f1833e7d8b3daa02 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:20 +0900 Subject: [PATCH 147/210] btrfs: drop unnecessary arguments from clustered allocation functions Now that, find_free_extent_clustered() and find_free_extent_unclustered() can access "last_ptr" from the "clustered" variable, we can drop it from the arguments. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49a682038d1b..93d5b5c48185 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3502,11 +3502,11 @@ struct find_free_extent_ctl { * Return 0 means we have found a location and set ffe_ctl->found_offset. 
*/ static int find_free_extent_clustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group **cluster_bg_ret) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; @@ -3606,9 +3606,9 @@ refill_cluster: * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* @@ -3672,15 +3672,13 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { - ret = find_free_extent_clustered(block_group, ffe_ctl->last_ptr, - ffe_ctl, bg_ret); + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); if (ret >= 0 || ret == -EAGAIN) return ret; /* ret == -ENOENT case falls through */ } - return find_free_extent_unclustered(block_group, ffe_ctl->last_ptr, - ffe_ctl); + return find_free_extent_unclustered(block_group, ffe_ctl); } static int do_allocation(struct btrfs_block_group *block_group, From baba50624fe57320837cf2aa566769385872d2af Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:21 +0900 Subject: [PATCH 148/210] btrfs: factor out release_block_group() Factor out release_block_group() from find_free_extent(). This function is called when it gives up an allocation from a block group. Each allocation policy should reset its information for an allocation in the next block group. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 93d5b5c48185..6d4f53e92833 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3693,6 +3693,24 @@ static int do_allocation(struct btrfs_block_group *block_group, } } +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
@@ -4069,11 +4087,7 @@ have_block_group: btrfs_release_block_group(block_group, delalloc); break; loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); + release_block_group(block_group, &ffe_ctl, delalloc); cond_resched(); } up_read(&space_info->groups_sem); From 0ab9724bf5f320f79d6f7e214f4fd1f95449a917 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:22 +0900 Subject: [PATCH 149/210] btrfs: factor out found_extent() for extent allocation Factor out found_extent() from find_free_extent_update_loop(). This function is called when a proper extent is found and before returning from find_free_extent(). Hook functions like found_extent_clustered() should save information for a next allocation. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6d4f53e92833..dfa88a795c3f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3711,6 +3711,30 @@ static void release_block_group(struct btrfs_block_group *block_group, btrfs_release_block_group(block_group, delalloc); } +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -3737,11 +3761,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 1; if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } + found_extent(ffe_ctl, ins); return 0; } From 15b7ee6584c66f21f0a9aa7f7a6f3bfda5c5ad34 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:23 +0900 Subject: [PATCH 150/210] btrfs: drop unnecessary arguments from find_free_extent_update_loop() Now that, we don't use last_ptr and use_cluster in the function. Drop these arguments from it. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dfa88a795c3f..054fc6c7be84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3741,10 +3741,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, * Return <0 means we failed to locate any free extent. 
*/ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, - bool full_search, bool use_cluster) + bool full_search) { struct btrfs_root *root = fs_info->extent_root; int ret; @@ -4112,9 +4111,7 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ffe_ctl.last_ptr, ins, - &ffe_ctl, full_search, - ffe_ctl.use_cluster); + ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); if (ret > 0) goto search; From c70e2139dc81b35c7d86a5d550490f7eb1451c2e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:24 +0900 Subject: [PATCH 151/210] btrfs: factor out chunk_allocation_failed() for extent allocation Factor out chunk_allocation_failed() from find_free_extent_update_loop(). This function is called when it failed to allocate a chunk. The function can modify "ffe_ctl->loop" and return 0 to continue with the next stage. Or, it can return -ENOSPC to give up here. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 054fc6c7be84..bcffb96d87c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3735,6 +3735,21 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + /* + * If we can't allocate a new chunk we've already looped through + * at least once, move on to the NO_EMPTY_SIZE case. + */ + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return 0; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -3806,16 +3821,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE); - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - /* Do not bail out on ENOSPC since we can do more. */ - if (ret < 0 && ret != -ENOSPC) + if (ret == -ENOSPC) + ret = chunk_allocation_failed(ffe_ctl); + else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; From 45d8e033b2354e74a3f80cffec0b0b1aa099f4c7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 25 Feb 2020 12:56:25 +0900 Subject: [PATCH 152/210] btrfs: skip LOOP_NO_EMPTY_SIZE if not clustered allocation LOOP_NO_EMPTY_SIZE is solely dedicated for clustered allocation. So, we can skip this stage and give up the allocation. Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bcffb96d87c6..1d895df67b3b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3835,6 +3835,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + /* * Don't loop again if we already have no empty_size and * no empty_cluster. 
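(Editor's illustration, not part of the series: patches 149-152 all hang off
the staged retry loop in find_free_extent(). The standalone C sketch below
models how a policy hook steers that loop; the stage names are invented and
only loosely follow the kernel's LOOP_* constants:)

#include <stdio.h>

enum stage {
	STAGE_FAST,		/* try what is cached/committed already */
	STAGE_ALLOC_CHUNK,	/* allocate a new chunk and retry */
	STAGE_NO_EMPTY_SIZE,	/* clustered-only last resort */
	STAGE_DONE,
};

/* cf. chunk_allocation_failed(): the policy turns failure into a stage. */
static enum stage chunk_allocation_failed(void)
{
	return STAGE_NO_EMPTY_SIZE;
}

int main(void)
{
	int clustered = 0;	/* pretend we run a non-clustered policy */
	enum stage stage = STAGE_FAST;

	while (stage != STAGE_DONE) {
		printf("searching in stage %d\n", stage);
		if (stage == STAGE_ALLOC_CHUNK) {
			/* pretend btrfs_chunk_alloc() returned -ENOSPC */
			stage = chunk_allocation_failed();
		} else if (stage == STAGE_NO_EMPTY_SIZE && !clustered) {
			/* cf. patch 152: skip the stage, give up early */
			printf("ENOSPC\n");
			return 1;
		} else {
			stage++;
		}
	}
	return 0;
}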
From 7e895409421534bc7a0d8d79a81be4533d70ba8f Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Tue, 25 Feb 2020 12:56:26 +0900
Subject: [PATCH 153/210] btrfs: factor out prepare_allocation() for extent allocation

This finally factors out prepare_allocation() from find_free_extent().
This function is called before the allocation loop; a policy-specific
function like prepare_allocation_clustered() initializes its private
information and can set a proper hint_byte to indicate where the
allocation should start.

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 110 +++++++++++++++++++++++++----------------
 1 file changed, 68 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d895df67b3b..36374b86529c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3853,6 +3853,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 	return -ENOSPC;
 }

+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+					struct find_free_extent_ctl *ffe_ctl,
+					struct btrfs_space_info *space_info,
+					struct btrfs_key *ins)
+{
+	/*
+	 * If our free space is heavily fragmented we may not be able to make
+	 * big contiguous allocations, so instead of doing the expensive search
+	 * for free space, simply return ENOSPC with our max_extent_size so we
+	 * can go ahead and search for a more manageable chunk.
+	 *
+	 * If our max_extent_size is large enough for our allocation simply
+	 * disable clustering since we will likely not be able to find enough
+	 * space to create a cluster and induce latency trying.
+	 */
+	if (space_info->max_extent_size) {
+		spin_lock(&space_info->lock);
+		if (space_info->max_extent_size &&
+		    ffe_ctl->num_bytes > space_info->max_extent_size) {
+			ins->offset = space_info->max_extent_size;
+			spin_unlock(&space_info->lock);
+			return -ENOSPC;
+		} else if (space_info->max_extent_size) {
+			ffe_ctl->use_cluster = false;
+		}
+		spin_unlock(&space_info->lock);
+	}
+
+	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+					       &ffe_ctl->empty_cluster);
+	if (ffe_ctl->last_ptr) {
+		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+		spin_lock(&last_ptr->lock);
+		if (last_ptr->block_group)
+			ffe_ctl->hint_byte = last_ptr->window_start;
+		if (last_ptr->fragmented) {
+			/*
+			 * We still set window_start so we can keep track of the
+			 * last place we found an allocation to try and save
+			 * some time.
+			 */
+			ffe_ctl->hint_byte = last_ptr->window_start;
+			ffe_ctl->use_cluster = false;
+		}
+		spin_unlock(&last_ptr->lock);
+	}
+
+	return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+			      struct find_free_extent_ctl *ffe_ctl,
+			      struct btrfs_space_info *space_info,
+			      struct btrfs_key *ins)
+{
+	switch (ffe_ctl->policy) {
+	case BTRFS_EXTENT_ALLOC_CLUSTERED:
+		return prepare_allocation_clustered(fs_info, ffe_ctl,
+						    space_info, ins);
+	default:
+		BUG();
+	}
+}
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -3922,48 +3987,9 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 		return -ENOSPC;
 	}

-	/*
-	 * If our free space is heavily fragmented we may not be able to make
-	 * big contiguous allocations, so instead of doing the expensive search
-	 * for free space, simply return ENOSPC with our max_extent_size so we
-	 * can go ahead and search for a more manageable chunk.
- * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. - */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - ffe_ctl.use_cluster = false; - } - spin_unlock(&space_info->lock); - } - - ffe_ctl.last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (ffe_ctl.last_ptr) { - struct btrfs_free_cluster *last_ptr = ffe_ctl.last_ptr; - - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - ffe_ctl.hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. - */ - ffe_ctl.hint_byte = last_ptr->window_start; - ffe_ctl.use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + if (ret < 0) + return ret; ffe_ctl.search_start = max(ffe_ctl.search_start, first_logical_byte(fs_info, 0)); From f0cc2cd70164efe8f75c5d99560f0f69969c72e4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 28 Feb 2020 13:04:36 +0000 Subject: [PATCH 154/210] Btrfs: fix crash during unmount due to race with delayed inode workers During unmount we can have a job from the delayed inode items work queue still running, that can lead to at least two bad things: 1) A crash, because the worker can try to create a transaction just after the fs roots were freed; 2) A transaction leak, because the worker can create a transaction before the fs roots are freed and just after we committed the last transaction and after we stopped the transaction kthread. A stack trace example of the crash: [79011.691214] kernel BUG at lib/radix-tree.c:982! [79011.692056] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [79011.693180] CPU: 3 PID: 1394 Comm: kworker/u8:2 Tainted: G W 5.6.0-rc2-btrfs-next-54 #2 (...) [79011.696789] Workqueue: btrfs-delayed-meta btrfs_work_helper [btrfs] [79011.697904] RIP: 0010:radix_tree_tag_set+0xe7/0x170 (...) 
[79011.702014] RSP: 0018:ffffb3c84a317ca0 EFLAGS: 00010293 [79011.702949] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [79011.704202] RDX: ffffb3c84a317cb0 RSI: ffffb3c84a317ca8 RDI: ffff8db3931340a0 [79011.705463] RBP: 0000000000000005 R08: 0000000000000005 R09: ffffffff974629d0 [79011.706756] R10: ffffb3c84a317bc0 R11: 0000000000000001 R12: ffff8db393134000 [79011.708010] R13: ffff8db3931340a0 R14: ffff8db393134068 R15: 0000000000000001 [79011.709270] FS: 0000000000000000(0000) GS:ffff8db3b6a00000(0000) knlGS:0000000000000000 [79011.710699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [79011.711710] CR2: 00007f22c2a0a000 CR3: 0000000232ad4005 CR4: 00000000003606e0 [79011.712958] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [79011.714205] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [79011.715448] Call Trace: [79011.715925] record_root_in_trans+0x72/0xf0 [btrfs] [79011.716819] btrfs_record_root_in_trans+0x4b/0x70 [btrfs] [79011.717925] start_transaction+0xdd/0x5c0 [btrfs] [79011.718829] btrfs_async_run_delayed_root+0x17e/0x2b0 [btrfs] [79011.719915] btrfs_work_helper+0xaa/0x720 [btrfs] [79011.720773] process_one_work+0x26d/0x6a0 [79011.721497] worker_thread+0x4f/0x3e0 [79011.722153] ? process_one_work+0x6a0/0x6a0 [79011.722901] kthread+0x103/0x140 [79011.723481] ? kthread_create_worker_on_cpu+0x70/0x70 [79011.724379] ret_from_fork+0x3a/0x50 (...) The following diagram shows a sequence of steps that lead to the crash during ummount of the filesystem: CPU 1 CPU 2 CPU 3 btrfs_punch_hole() btrfs_btree_balance_dirty() btrfs_balance_delayed_items() --> sees fs_info->delayed_root->items with value 200, which is greater than BTRFS_DELAYED_BACKGROUND (128) and smaller than BTRFS_DELAYED_WRITEBACK (512) btrfs_wq_run_delayed_node() --> queues a job for fs_info->delayed_workers to run btrfs_async_run_delayed_root() btrfs_async_run_delayed_root() --> job queued by CPU 1 --> starts picking and running delayed nodes from the prepare_list list close_ctree() btrfs_delete_unused_bgs() btrfs_commit_super() btrfs_join_transaction() --> gets transaction N btrfs_commit_transaction(N) --> set transaction state to TRANTS_STATE_COMMIT_START btrfs_first_prepared_delayed_node() --> picks delayed node X through the prepared_list list btrfs_run_delayed_items() btrfs_first_delayed_node() --> also picks delayed node X but through the node_list list __btrfs_commit_inode_delayed_items() --> runs all delayed items from this node and drops the node's item count to 0 through call to btrfs_release_delayed_inode() --> finishes running any remaining delayed nodes --> finishes transaction commit --> stops cleaner and transaction threads btrfs_free_fs_roots() --> frees all roots and removes them from the radix tree fs_info->fs_roots_radix btrfs_join_transaction() start_transaction() btrfs_record_root_in_trans() record_root_in_trans() radix_tree_tag_set() --> crashes because the root is not in the radix tree anymore If the worker is able to call btrfs_join_transaction() before the unmount task frees the fs roots, we end up leaking a transaction and all its resources, since after the call to btrfs_commit_super() and stopping the transaction kthread, we don't expect to have any transaction open anymore. When this situation happens the worker has a delayed node that has no more items to run, since the task calling btrfs_run_delayed_items(), which is doing a transaction commit, picks the same node and runs all its items first. 
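In pattern form, the fix boils down to the classic flush-before-teardown ordering. A minimal sketch (hypothetical driver-style code with invented demo_ names, not the actual btrfs change, which follows below):

#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_state {
	struct workqueue_struct *wq;
	void *roots;		/* stands in for fs_info->fs_roots_radix */
};

static void demo_teardown(struct demo_state *s)
{
	/*
	 * Wait for queued and still-running work items first: a worker may
	 * dereference s->roots, and freeing it before the flush is exactly
	 * the use-after-free crash described above.
	 */
	flush_workqueue(s->wq);

	kfree(s->roots);
	destroy_workqueue(s->wq);
}

The btrfs version of this ordering is btrfs_flush_workqueue() called from close_ctree() before btrfs_commit_super(), as the diff below shows.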
We cannot wait for the worker to complete when running delayed items through btrfs_run_delayed_items(), because we call that function in several phases of a transaction commit, and that could cause a deadlock because the worker calls btrfs_join_transaction() and the task doing the transaction commit may have already set the transaction state to TRANS_STATE_COMMIT_DOING. Also it's not possible to get into a situation where only some of the items of a delayed node are added to the fs/subvolume tree in the current transaction and the remaining ones in the next transaction, because when running the items of a delayed inode we lock its mutex, effectively waiting for the worker if the worker is running the items of the delayed node already. Since this can only cause issues when unmounting a filesystem, fix it in a simple way by waiting for any jobs on the delayed workers queue before calling btrfs_commit_super() at close_ctree(). This works because at this point no one can call btrfs_btree_balance_dirty() or btrfs_balance_delayed_items(), and if we end up waiting for any worker to complete, btrfs_commit_super() will commit the transaction created by the worker. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 8 ++++++++ fs/btrfs/async-thread.h | 1 + fs/btrfs/disk-io.c | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1d32a07bb2d1..309516e6a968 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + if (wq->high) + flush_workqueue(wq->high->normal_wq); + + flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index a4434301d84d..3204daa51b95 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f01f19c76cf5..483b54077d01 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4067,6 +4067,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_delete_unused_bgs(fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case.
+ */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); From fa121a26b2ceabce613e0b4cfc7498cfde73fe8d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Feb 2020 16:41:10 -0500 Subject: [PATCH 155/210] btrfs: fix btrfs_calc_reclaim_metadata_size calculation I noticed while running my snapshot torture test that we were getting a lot of metadata chunks allocated with very little actually used. Digging into this we would commit the transaction, still not have enough space, and then force a chunk allocation. I noticed that we were barely flushing any delalloc at all, despite the fact that we had around 13gib of outstanding delalloc reservations. It turns out this is because of our btrfs_calc_reclaim_metadata_size() calculation. It _only_ takes into account the outstanding ticket sizes, which isn't the whole story. In this particular workload we're slowly filling up the disk, which means our overcommit space will suddenly become a lot less, and our outstanding reservations will be well more than what we can handle. However we are only flushing based on our ticket size, which is much less than we need to actually reclaim. So fix btrfs_calc_reclaim_metadata_size() to take into account the overage in the case that we've gotten less available space suddenly. This makes it so we attempt to reclaim a lot more delalloc space, which allows us to make our reservations and we no longer are allocating a bunch of needless metadata chunks. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3b8d8fa8d767..9cb511d8cd9d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -306,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; - u64 used; int factor; - /* Don't overcommit when in mixed mode. 
*/ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); - used = btrfs_space_info_used(space_info, true); avail = atomic64_read(&fs_info->free_chunk_space); /* @@ -345,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + return avail; +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + u64 avail; + u64 used; + + /* Don't overcommit when in mixed mode */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + used = btrfs_space_info_used(space_info, true); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -776,6 +786,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 used; + u64 avail; u64 expected; u64 to_reclaim = 0; @@ -783,6 +794,20 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, to_reclaim += ticket->bytes; list_for_each_entry(ticket, &space_info->priority_tickets, list) to_reclaim += ticket->bytes; + + avail = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + used = btrfs_space_info_used(space_info, true); + + /* + * We may be flushing because suddenly we have less space than we had + * before, and now we're well over-committed based on our current free + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ + if (space_info->total_bytes + avail < used) + to_reclaim += used - (space_info->total_bytes + avail); + if (to_reclaim) return to_reclaim; From 11c67b1a40b0a4a7c712103508a63267c7ecd4b8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Mar 2020 12:29:25 +0200 Subject: [PATCH 156/210] btrfs: Rename __btrfs_alloc_chunk to btrfs_alloc_chunk Having btrfs_alloc_chunk as a separate wrapper doesn't bring any value, since it only encapsulates a lockdep assert and a call to find_next_chunk. Simply rename the internal __btrfs_alloc_chunk function to the public one and remove its second parameter, as all callers always pass the return value of find_next_chunk. Finally, migrate the call to lockdep_assert_held so as to not lose the check.
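The shape of the change, as a reduced sketch (the demo_ names are invented; only lockdep_assert_held, DEFINE_MUTEX, and the header names are real kernel API): the assertion moves from the deleted wrapper into the function that actually requires the lock, and the start offset is computed internally instead of being passed in.

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/types.h>

static DEFINE_MUTEX(demo_chunk_mutex);

static u64 demo_find_next_chunk(void)
{
	return 0;	/* placeholder for find_next_chunk() */
}

/* Formerly __demo_alloc_chunk(trans, start, type); now the only entry point. */
static int demo_alloc_chunk(u64 type)
{
	u64 start;

	/* Moved here from the deleted wrapper so the check is not lost. */
	lockdep_assert_held(&demo_chunk_mutex);

	start = demo_find_next_chunk();	/* all callers passed exactly this */
	/* ... allocate the chunk at 'start' with profile 'type' ... */
	return 0;
}

Folding the wrapper away removes the redundant parameter while keeping the locking contract checkable under lockdep.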
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1f7ad23dcfb5..f693bc5ed836 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5114,8 +5114,7 @@ error_del_extent: return ret; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - u64 start, u64 type) +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -5123,6 +5122,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl ctl; int ret; + lockdep_assert_held(&info->chunk_mutex); + if (!alloc_profile_is_valid(type, 0)) { ASSERT(0); return -EINVAL; } @@ -5140,7 +5141,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } - ctl.start = start; + ctl.start = find_next_chunk(info); ctl.type = type; init_alloc_chunk_ctl(fs_devices, &ctl); @@ -5164,6 +5165,13 @@ out: return ret; } +/* + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { @@ -5262,39 +5270,19 @@ out: return ret; } -/* - * Chunk allocation falls into two parts. The first part does work - * that makes the new allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. - */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) -{ - u64 chunk_offset; - - lockdep_assert_held(&trans->fs_info->chunk_mutex); - chunk_offset = find_next_chunk(trans->fs_info); - return __btrfs_alloc_chunk(trans, chunk_offset, type); -} - static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - u64 chunk_offset; - u64 sys_chunk_offset; u64 alloc_profile; int ret; - chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_metadata_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); if (ret) return ret; - sys_chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_system_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); return ret; } From 59a0fcdb489dae25416e93b9fc67cc97a6847f01 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:45 +0100 Subject: [PATCH 157/210] btrfs: inline checksum name and driver definitions There's an unnecessary indirection in the checksum definition table: a pointer plus the string itself. The strings are short and the overall size of one entry is now 24 bytes.
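The 24-byte figure can be checked with a standalone userspace sketch (struct name invented; layout assumptions are noted in the comments):

#include <stdint.h>
#include <stdio.h>

/* Same layout as the patched btrfs_csums entry in the diff below. */
struct csum_entry {
	uint16_t size;		/* 2 bytes */
	char name[10];		/* char arrays instead of pointers */
	char driver[12];
};

int main(void)
{
	/*
	 * 2 + 10 + 12 = 24; the char arrays have byte alignment, so no
	 * padding is inserted on common ABIs and sizeof reports 24.
	 */
	printf("%zu\n", sizeof(struct csum_entry));
	return 0;
}

For comparison, on a 64-bit target the old layout held two 8-byte pointers per entry, with the string data itself living elsewhere in memory.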
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f948435e87df..bfedbbe2311f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, static const struct btrfs_csums { u16 size; - const char *name; - const char *driver; + const char name[10]; + const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, @@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type) const char *btrfs_super_csum_driver(u16 csum_type) { /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver ?: + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : btrfs_csums[csum_type].name; } From e9be5a303d271faa42b44b606fbe4d7507a7f026 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:47 +0100 Subject: [PATCH 158/210] btrfs: simplify tree block checksumming loop The whole point of csum_tree_block is to iterate over all extent buffer pages and pass them to the checksumming functions. The bytes where the checksum is stored must be skipped, hence the use of map_private_extent_buffer, which complicates further offset calculations. As the first page is always present, checksum the relevant bytes unconditionally and then do a simple iteration over the remaining pages. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 483b54077d01..6f63cc476d1f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -259,38 +259,22 @@ out: static int csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; + const int num_pages = fs_info->nodesize >> PAGE_SHIFT; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - unsigned long len; - unsigned long cur_len; - unsigned long offset = BTRFS_CSUM_SIZE; char *kaddr; - unsigned long map_start; - unsigned long map_len; - int err; + int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); + kaddr = page_address(buf->pages[0]); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - len = buf->len - offset; - - while (len > 0) { - /* - * Note: we don't need to check for the err == 1 case here, as - * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' - * and 'min_len = 32' and the currently implemented mapping - * algorithm we cannot cross a page boundary.
- */ - err = map_private_extent_buffer(buf, offset, 32, - &kaddr, &map_start, &map_len); - if (WARN_ON(err)) - return err; - cur_len = min(len, map_len - (offset - map_start)); - crypto_shash_update(shash, kaddr + offset - map_start, cur_len); - len -= cur_len; - offset += cur_len; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(buf->pages[i]); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); return 0; From c67b38925b682bdd3419ff39688991b1dc7cdfb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:49 +0100 Subject: [PATCH 159/210] btrfs: return void from csum_tree_block Now that csum_tree_block is not returning any errors, we can make it return void and simplify the callers. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f63cc476d1f..6b00ddea0b48 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -253,10 +253,8 @@ out: /* * Compute the csum of a btree block and store the result to provided buffer. - * - * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct extent_buffer *buf, u8 *result) +static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; const int num_pages = fs_info->nodesize >> PAGE_SHIFT; @@ -276,8 +274,6 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) } memset(result, 0, BTRFS_CSUM_SIZE); crypto_shash_final(shash, result); - - return 0; } /* @@ -528,8 +524,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); - if (csum_tree_block(eb, result)) - return -EINVAL; + csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); @@ -640,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(eb, result); - if (ret) - goto err; + csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { u32 val; From 5ba366c3999cb1a1a56571b2c64c13ff67f1005c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:52 +0100 Subject: [PATCH 160/210] btrfs: balance: factor out convert profile validation The validation follows the same steps for all three block group types; the existing helper validate_convert_profile can be enhanced to do more of the common things. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f693bc5ed836..74c34c388098 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3801,13 +3801,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -/* Non-zero return value signifies invalidity */ -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, - u64 allowed) +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false.
+ */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, + const struct btrfs_balance_args *bargs, + u64 allowed, const char *type) { - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl_arg->target, 1) || - (bctl_arg->target & ~allowed))); + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return true; + + /* Profile is valid and does not have bits outside of the allowed set */ + if (alloc_profile_is_valid(bargs->target, 1) && + (bargs->target & ~allowed) == 0) + return true; + + btrfs_err(fs_info, "balance: invalid convert %s profile %s", + type, btrfs_bg_type_to_raid_name(bargs->target)); + return false; } /* @@ -4023,24 +4035,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; - if (validate_convert_profile(&bctl->data, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert data profile %s", - btrfs_bg_type_to_raid_name(bctl->data.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->meta, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert metadata profile %s", - btrfs_bg_type_to_raid_name(bctl->meta.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->sys, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert system profile %s", - btrfs_bg_type_to_raid_name(bctl->sys.target)); + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } From c6600d9ac6987933b8a60950b7a51cc772095141 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 4 Mar 2020 17:04:47 +0200 Subject: [PATCH 161/210] btrfs: Remove impossible BUG_ON in get_tree_block_key relocate_tree_blocks calls get_tree_block_key for a block iff that block has its ->key_ready equal false. Thus the BUG_ON in the latter function cannot ever be triggered so remove it. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6130ba69bc49..ba39f6969a72 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3089,7 +3089,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - BUG_ON(block->key_ready); eb = read_tree_block(fs_info, block->bytenr, block->key.offset, block->level, NULL); if (IS_ERR(eb)) { From 65cd6d9e30fa85922cb073625665807130d01e70 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 4 Mar 2020 11:33:53 +0200 Subject: [PATCH 162/210] btrfs: Open code insert_extent_backref No need to add a level of indirection for hiding a simple 'if'. Open code insert_extent_backref in its sole caller. No functional changes. 
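Reduced to a runnable toy (all demo_ names invented, with BTRFS_FIRST_FREE_OBJECTID's value of 256 mirrored as a constant), the change is the removal of a single-use dispatch helper; the if/else and its invariant now live directly at the call site:

#include <assert.h>
#include <stdio.h>

enum { DEMO_FIRST_FREE_OBJECTID = 256 };	/* mirrors BTRFS_FIRST_FREE_OBJECTID */

static int demo_insert_tree_block_ref(void)
{
	return 0;	/* stand-in for insert_tree_block_ref() */
}

static int demo_insert_extent_data_ref(int refs_to_add)
{
	(void)refs_to_add;	/* stand-in for insert_extent_data_ref() */
	return 0;
}

/* The sole caller now carries the branch that the deleted helper hid. */
static int demo_inc_extent_ref(unsigned long long owner, int refs_to_add)
{
	if (owner < DEMO_FIRST_FREE_OBJECTID) {
		/* Tree blocks gain references one at a time. */
		assert(refs_to_add == 1);
		return demo_insert_tree_block_ref();
	}
	return demo_insert_extent_data_ref(refs_to_add);
}

int main(void)
{
	/* Data extents may add several refs at once; tree blocks only one. */
	assert(demo_inc_extent_ref(512, 3) == 0);
	assert(demo_inc_extent_ref(100, 1) == 0);
	printf("ok\n");
	return 0;
}

In the actual patch the BUG_ON(refs_to_add != 1) invariant is unchanged; it simply moves with the branch into __btrfs_inc_extent_ref(), as the diff below shows.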
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36374b86529c..a3778937d705 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1189,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, @@ -1493,8 +1475,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + } if (ret) btrfs_abort_transaction(trans, ret); out: From 29566c9c773456467933ee22bbca1c2b72a3506c Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Fri, 6 Mar 2020 12:22:43 +0530 Subject: [PATCH 163/210] btrfs: add RCU locks around block group initialization The space_info list is normally RCU protected and should be traversed with rcu_read_lock held. There's a warning [29.104756] WARNING: suspicious RCU usage [29.105046] 5.6.0-rc4-next-20200305 #1 Not tainted [29.105231] ----------------------------- [29.105401] fs/btrfs/block-group.c:2011 RCU-list traversed in non-reader section!! pointing out that the locking is missing in btrfs_read_block_groups. However this is not necessary as the list traversal happens at mount time when there's no other thread potentially accessing the list. To fix the warning and for consistency let's add the RCU lock/unlock, the code won't be affected much as it's doing some lightweight operations. Reported-by: Guenter Roeck Signed-off-by: Madhuparna Bhowmik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b8f39a679064..786849fcc319 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2008,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } + rcu_read_lock(); list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | @@ -2028,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } + rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); From 17b238acf7c665e5c1eb44a31be10299fcbf8858 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Fri, 6 Mar 2020 10:49:12 -0600 Subject: [PATCH 164/210] btrfs: delayed-inode: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero." [1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 74ae226ffaf0..ca96ef007d8f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -70,7 +70,7 @@ struct btrfs_delayed_item { refcount_t refs; int ins_or_del; u32 data_len; - char data[0]; + char data[]; }; static inline void btrfs_init_delayed_root( From 7593f4c53c699699c6968a5fdd795d0fdf99a65d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 6 Mar 2020 10:51:05 -0600 Subject: [PATCH 165/210] btrfs: rcu-string: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero." [1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/rcu-string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index a97dc74a4d3d..5c1a617eb25d 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -8,7 +8,7 @@ struct rcu_string { struct rcu_head rcu; - char str[0]; + char str[]; }; static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) From a8753ee3a859a519613641963464b38b9a79dc9c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Fri, 6 Mar 2020 16:13:33 -0600 Subject: [PATCH 166/210] btrfs: scrub: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero." [1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 16aa680134fd..adaf8ab694d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -149,7 +149,7 @@ struct scrub_parity { */ unsigned long *ebitmap; - unsigned long bitmap[0]; + unsigned long bitmap[]; }; struct scrub_ctx { From 6a177381007b463ad611375cce526c24f12ab081 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 28 Feb 2020 13:04:17 +0000 Subject: [PATCH 167/210] Btrfs: move all reflink implementation code into its own file The reflink code is quite large and has been living in ioctl.c since ever. It has grown over the years after many bug fixes and improvements, and since I'm planning on making some further improvements on it, it's time to get it better organized by moving into its own file, reflink.c (similar to what xfs does for example). This change only moves the code out of ioctl.c into the new file, it doesn't do any other change. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 3 - fs/btrfs/file.c | 1 + fs/btrfs/ioctl.c | 733 --------------------------------------------- fs/btrfs/reflink.c | 729 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/reflink.h | 12 + 6 files changed, 743 insertions(+), 737 deletions(-) create mode 100644 fs/btrfs/reflink.c create mode 100644 fs/btrfs/reflink.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9a0ff3384381..e738f6206ea5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea5d0675465a..ecd016f7dab1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2974,9 +2974,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8a974a82be51..31c72371a164 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,6 +27,7 @@ #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" +#include "reflink.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 04899d3d775e..f8a73a28022a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,10 +87,6 @@ struct btrfs_ioctl_send_args_32 { struct btrfs_ioctl_send_args_32) #endif -static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff, - int no_time_update); - /* Mask out flags that are inappropriate for the given type of inode. 
*/ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, unsigned int flags) @@ -3315,735 +3311,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) -{ - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - int ret; - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. - */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - - return ret; -} - -#define BTRFS_MAX_DEDUPE_LEN SZ_16M - -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff) -{ - int ret; - u64 i, tail_len, chunk_count; - struct btrfs_root *root_dst = BTRFS_I(dst)->root; - - spin_lock(&root_dst->root_item_lock); - if (root_dst->send_in_progress) { - btrfs_warn_rl(root_dst->fs_info, -"cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, - root_dst->send_in_progress); - spin_unlock(&root_dst->root_item_lock); - return -EAGAIN; - } - root_dst->dedupe_in_progress++; - spin_unlock(&root_dst->root_item_lock); - - tail_len = olen % BTRFS_MAX_DEDUPE_LEN; - chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - - for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); - if (ret) - goto out; - - loff += BTRFS_MAX_DEDUPE_LEN; - dst_loff += BTRFS_MAX_DEDUPE_LEN; - } - - if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff); -out: - spin_lock(&root_dst->root_item_lock); - root_dst->dedupe_in_progress--; - spin_unlock(&root_dst->root_item_lock); - - return ret; -} - -static int clone_finish_inode_update(struct btrfs_trans_handle *trans, - struct inode *inode, - u64 endoff, - const u64 destoff, - const u64 olen, - int no_time_update) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); - /* - * We round up to the block size at eof when determining which - * extents to clone above, but shouldn't round up the file size. - */ - if (endoff > destoff + olen) - endoff = destoff + olen; - if (endoff > inode->i_size) { - i_size_write(inode, endoff); - btrfs_inode_safe_disk_i_size_write(inode, 0); - } - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - ret = btrfs_end_transaction(trans); -out: - return ret; -} - -/* - * Make sure we do not end up inserting an inline extent into a file that has - * already other (non-inline) extents. 
If a file has an inline extent it can - * not have any other extents and the (single) inline extent must start at the - * file offset 0. Failing to respect these rules will lead to file corruption, - * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc - * - * We can have extents that have been already written to disk or we can have - * dirty ranges still in delalloc, in which case the extent maps and items are - * created only when we run delalloc, and the delalloc ranges might fall outside - * the range we are currently locking in the inode's io tree. So we check the - * inode's i_size because of that (i_size updates are done while holding the - * i_mutex, which we are holding here). - * We also check to see if the inode has a size not greater than "datal" but has - * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are - * protected against such concurrent fallocate calls by the i_mutex). - * - * If the file has no extents but a size greater than datal, do not allow the - * copy because we would need turn the inline extent into a non-inline one (even - * with NO_HOLES enabled). If we find our destination inode only has one inline - * extent, just overwrite it with the source inline extent if its size is less - * than the source extent's size, or we could copy the source inline extent's - * data into the destination inode's inline extent if the later is greater then - * the former. - */ -static int clone_copy_inline_extent(struct inode *dst, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_key *new_key, - const u64 drop_start, - const u64 datal, - const u64 skip, - const u64 size, - char *inline_data) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); - struct btrfs_root *root = BTRFS_I(dst)->root; - const u64 aligned_end = ALIGN(new_key->offset + datal, - fs_info->sectorsize); - int ret; - struct btrfs_key key; - - if (new_key->offset > 0) - return -EOPNOTSUPP; - - key.objectid = btrfs_ino(BTRFS_I(dst)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - else if (ret > 0) - goto copy_inline_extent; - } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) { - ASSERT(key.offset > 0); - return -EOPNOTSUPP; - } - } else if (i_size_read(dst) <= datal) { - struct btrfs_file_extent_item *ei; - u64 ext_len; - - /* - * If the file size is <= datal, make sure there are no other - * extents following (can happen do to an fallocate call with - * the flag FALLOC_FL_KEEP_SIZE). - */ - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - /* - * If it's an inline extent, it can not have other extents - * following it. 
- */ - if (btrfs_file_extent_type(path->nodes[0], ei) == - BTRFS_FILE_EXTENT_INLINE) - goto copy_inline_extent; - - ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - if (ext_len > aligned_end) - return -EOPNOTSUPP; - - ret = btrfs_next_item(root, path); - if (ret < 0) { - return ret; - } else if (ret == 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) - return -EOPNOTSUPP; - } - } - -copy_inline_extent: - /* - * We have no extent items, or we have an extent at offset 0 which may - * or may not be inlined. All these cases are dealt the same way. - */ - if (i_size_read(dst) > datal) { - /* - * If the destination inode has an inline extent... - * This would require copying the data from the source inline - * extent into the beginning of the destination's inline extent. - * But this is really complex, both extents can be compressed - * or just one of them, which would require decompressing and - * re-compressing data (which could increase the new compressed - * size, not allowing the compressed data to fit anymore in an - * inline extent). - * So just don't support this case for now (it should be rare, - * we are not really saving space when cloning inline extents). - */ - return -EOPNOTSUPP; - } - - btrfs_release_path(path); - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); - if (ret) - return ret; - ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) - return ret; - - if (skip) { - const u32 start = btrfs_file_extent_calc_inline_size(0); - - memmove(inline_data + start, inline_data + start + skip, datal); - } - - write_extent_buffer(path->nodes[0], inline_data, - btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]), - size); - inode_add_bytes(dst, datal); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); - - return 0; -} - -/** - * btrfs_clone() - clone a range from inode file to another - * - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode - */ -static int btrfs_clone(struct inode *src, struct inode *inode, - const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path = NULL; - struct extent_buffer *leaf; - struct btrfs_trans_handle *trans; - char *buf = NULL; - struct btrfs_key key; - u32 nritems; - int slot; - int ret; - const u64 len = olen_aligned; - u64 last_dest_end = destoff; - - ret = -ENOMEM; - buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!buf) - return ret; - - path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); - return ret; - } - - path->reada = READA_FORWARD; - /* clone data */ - key.objectid = btrfs_ino(BTRFS_I(src)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = off; - - while (1) { - u64 next_key_min_offset = key.offset + 1; - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - /* - * note the key will change type as we walk through the - * tree. 
- */ - path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, - 0, 0); - if (ret < 0) - goto out; - /* - * First search, if no extent item that starts at offset off was - * found but the previous item is an extent item, it's possible - * it might overlap our target range, therefore process it. - */ - if (key.offset == off && ret > 0 && path->slots[0] > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0] - 1); - if (key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - - nritems = btrfs_header_nritems(path->nodes[0]); -process_slot: - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(BTRFS_I(src)->root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - nritems = btrfs_header_nritems(path->nodes[0]); - } - leaf = path->nodes[0]; - slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.type > BTRFS_EXTENT_DATA_KEY || - key.objectid != btrfs_ino(BTRFS_I(src))) - break; - - ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* Take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, extent); - } - - /* - * The first search might have left us at an extent item that - * ends before our target range's start, can happen if we have - * holes and NO_HOLES feature enabled. - */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; - } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), - size); - - btrfs_release_path(path); - path->leave_spinning = 0; - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; - - /* - * Deal with a hole that doesn't have an extent item that - * represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning range or at - * the beginning (fully overlaps it or partially overlaps it). 
- */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; - - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* Subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* Subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - clone_info.disk_offset = disko; - clone_info.disk_len = diskl; - clone_info.data_offset = datao; - clone_info.data_len = datal; - clone_info.file_offset = new_key.offset; - clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, - drop_start, - new_key.offset + datal - 1, - &clone_info, &trans); - if (ret) - goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - /* - * If our extent is inline, we know we will drop or - * adjust at most 1 extent item in the destination root. - * - * 1 - adjusting old extent (we may have to split it) - * 1 - add new extent - * 1 - inode update - */ - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = clone_copy_inline_extent(inode, trans, path, - &new_key, drop_start, - datal, skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - } - - btrfs_release_path(path); - - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen, no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - - btrfs_release_path(path); - key.offset = next_key_min_offset; - - if (fatal_signal_pending(current)) { - ret = -EINTR; - goto out; - } - } - ret = 0; - - if (last_dest_end < destoff + len) { - /* - * We have an implicit hole that fully or partially overlaps our - * cloning range at its end. This means that we either have the - * NO_HOLES feature enabled or the implicit hole happened due to - * mixing buffered and direct IO writes against this file. - */ - btrfs_release_path(path); - path->leave_spinning = 0; - - ret = btrfs_punch_hole_range(inode, path, - last_dest_end, destoff + len - 1, - NULL, &trans); - if (ret) - goto out; - - ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen, no_time_update); - } - -out: - btrfs_free_path(path); - kvfree(buf); - return ret; -} - -static noinline int btrfs_clone_files(struct file *file, struct file *file_src, - u64 off, u64 olen, u64 destoff) -{ - struct inode *inode = file_inode(file); - struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; - - /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. 
- * - * - split destination inode's inline extents. The inline extents can - * be either compressed or non-compressed. - */ - - /* - * VFS's generic_remap_file_range_prep() protects us from cloning the - * eof block into the middle of a file, which would result in corruption - * if the file size is not blocksize aligned. So we don't need to check - * for that case here. - */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; - - if (destoff > inode->i_size) { - const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - return ret; - /* - * We may have truncated the last block if the inode's size is - * not sector size aligned, so we need to wait for writeback to - * complete before proceeding further, otherwise we can race - * with cloning and attempt to increment a reference to an - * extent that no longer exists (writeback completed right after - * we found the previous extent covering eof and before we - * attempted to increment its reference count). - */ - ret = btrfs_wait_ordered_range(inode, wb_start, - destoff - wb_start); - if (ret) - return ret; - } - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. - */ - btrfs_double_extent_lock(src, off, inode, destoff, len); - ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); - /* - * Truncate page cache pages so that future reads will see the cloned - * data immediately and not the previous data. - */ - truncate_inode_pages_range(&inode->i_data, - round_down(destoff, PAGE_SIZE), - round_up(destoff + len, PAGE_SIZE) - 1); - - return ret; -} - -static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; - u64 wb_len; - int ret; - - if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; - - if (btrfs_root_readonly(root_out)) - return -EROFS; - - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; - } - - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { - return -EINVAL; - } - - /* - * Now that the inodes are locked, we need to start writeback ourselves - * and can not rely on the writeback from the VFS's generic helper - * generic_remap_file_range_prep() because: - * - * 1) For compression we must call filemap_fdatawrite_range() range - * twice (btrfs_fdatawrite_range() does it for us), and the generic - * helper only calls it once; - * - * 2) filemap_fdatawrite_range(), called by the generic helper only - * waits for the writeback to complete, i.e. for IO to be done, and - * not for the ordered extents to complete. We need to wait for them - * to complete so that new file extent items are in the fs tree. 
-	 */
-	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
-		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
-	else
-		wb_len = ALIGN(*len, bs);
-
-	/*
-	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
-	 * any in progress could create its ordered extents after we wait for
-	 * existing ordered extents below).
-	 */
-	inode_dio_wait(inode_in);
-	if (!same_inode)
-		inode_dio_wait(inode_out);
-
-	/*
-	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
-	 *
-	 * Btrfs' back references do not have a block level granularity, they
-	 * work at the whole extent level.
-	 * NOCOW buffered writes without data space reserved may not be able
-	 * to fall back to CoW due to lack of data space, thus could cause
-	 * data loss.
-	 *
-	 * Here we take a shortcut by flushing the whole inode, so that all
-	 * nocow writes should reach disk as nocow before we increase the
-	 * reference of the extent. We could do better by only flushing NOCOW
-	 * data, but that needs extra accounting.
-	 *
-	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
-	 * CoWed anyway, not affecting nocow part.
-	 */
-	ret = filemap_flush(inode_in->i_mapping);
-	if (ret < 0)
-		return ret;
-
-	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
-				       wb_len);
-	if (ret < 0)
-		return ret;
-	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
-				       wb_len);
-	if (ret < 0)
-		return ret;
-
-	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-					     len, remap_flags);
-}
-
-loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
-		struct file *dst_file, loff_t destoff, loff_t len,
-		unsigned int remap_flags)
-{
-	struct inode *src_inode = file_inode(src_file);
-	struct inode *dst_inode = file_inode(dst_file);
-	bool same_inode = dst_inode == src_inode;
-	int ret;
-
-	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
-		return -EINVAL;
-
-	if (same_inode)
-		inode_lock(src_inode);
-	else
-		lock_two_nondirectories(src_inode, dst_inode);
-
-	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
-					  &len, remap_flags);
-	if (ret < 0 || len == 0)
-		goto out_unlock;
-
-	if (remap_flags & REMAP_FILE_DEDUP)
-		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
-	else
-		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
-
-out_unlock:
-	if (same_inode)
-		inode_unlock(src_inode);
-	else
-		unlock_two_nondirectories(src_inode, dst_inode);
-
-	return ret < 0 ? ret : len;
-}
-
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 {
 	struct inode *inode = file_inode(file);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
new file mode 100644
index 000000000000..367b11656118
--- /dev/null
+++ b/fs/btrfs/reflink.c
@@ -0,0 +1,729 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/iversion.h>
+#include "ctree.h"
+#include "reflink.h"
+#include "transaction.h"
+
+#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     u64 endoff,
+				     const u64 destoff,
+				     const u64 olen,
+				     int no_time_update)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	inode_inc_iversion(inode);
+	if (!no_time_update)
+		inode->i_mtime = inode->i_ctime = current_time(inode);
+	/*
+	 * We round up to the block size at eof when determining which
+	 * extents to clone above, but shouldn't round up the file size.
+	 */
+	if (endoff > destoff + olen)
+		endoff = destoff + olen;
+	if (endoff > inode->i_size) {
+		i_size_write(inode, endoff);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
+	}
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		btrfs_end_transaction(trans);
+		goto out;
+	}
+	ret = btrfs_end_transaction(trans);
+out:
+	return ret;
+}
+
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm,
+ * etc.
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need to turn the inline extent into a non-inline one
+ * (even with NO_HOLES enabled). If we find our destination inode only has one
+ * inline extent, just overwrite it with the source inline extent if its size
+ * is less than the source extent's size, or we could copy the source inline
+ * extent's data into the destination inode's inline extent if the latter is
+ * greater than the former.
+ */
+static int clone_copy_inline_extent(struct inode *dst,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct btrfs_key *new_key,
+				    const u64 drop_start,
+				    const u64 datal,
+				    const u64 skip,
+				    const u64 size,
+				    char *inline_data)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+	struct btrfs_root *root = BTRFS_I(dst)->root;
+	const u64 aligned_end = ALIGN(new_key->offset + datal,
+				      fs_info->sectorsize);
+	int ret;
+	struct btrfs_key key;
+
+	if (new_key->offset > 0)
+		return -EOPNOTSUPP;
+
+	key.objectid = btrfs_ino(BTRFS_I(dst));
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		return ret;
+	} else if (ret > 0) {
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				goto copy_inline_extent;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			ASSERT(key.offset > 0);
+			return -EOPNOTSUPP;
+		}
+	} else if (i_size_read(dst) <= datal) {
+		struct btrfs_file_extent_item *ei;
+		u64 ext_len;
+
+		/*
+		 * If the file size is <= datal, make sure there are no other
+		 * extents following (can happen due to a fallocate call with
+		 * the flag FALLOC_FL_KEEP_SIZE).
+		 */
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		/*
+		 * If it's an inline extent, it can not have other extents
+		 * following it.
+		 */
+		if (btrfs_file_extent_type(path->nodes[0], ei) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto copy_inline_extent;
+
+		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+		if (ext_len > aligned_end)
+			return -EOPNOTSUPP;
+
+		ret = btrfs_next_item(root, path);
+		if (ret < 0) {
+			return ret;
+		} else if (ret == 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+			    key.type == BTRFS_EXTENT_DATA_KEY)
+				return -EOPNOTSUPP;
+		}
+	}
+
+copy_inline_extent:
+	/*
+	 * We have no extent items, or we have an extent at offset 0 which may
+	 * or may not be inlined. All these cases are dealt with in the same
+	 * way.
+	 */
+	if (i_size_read(dst) > datal) {
+		/*
+		 * If the destination inode has an inline extent, then this
+		 * would require copying the data from the source inline
+		 * extent into the beginning of the destination's inline
+		 * extent. But this is really complex, both extents can be
+		 * compressed or just one of them, which would require
+		 * decompressing and re-compressing data (which could increase
+		 * the new compressed size, not allowing the compressed data
+		 * to fit anymore in an inline extent).
+		 * So just don't support this case for now (it should be rare,
+		 * we are not really saving space when cloning inline extents).
+		 */
+		return -EOPNOTSUPP;
+	}
+
+	btrfs_release_path(path);
+	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+	if (ret)
+		return ret;
+	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+	if (ret)
+		return ret;
+
+	if (skip) {
+		const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+		memmove(inline_data + start, inline_data + start + skip, datal);
+	}
+
+	write_extent_buffer(path->nodes[0], inline_data,
+			    btrfs_item_ptr_offset(path->nodes[0],
+						  path->slots[0]),
+			    size);
+	inode_add_bytes(dst, datal);
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+
+	return 0;
+}
+
+/**
+ * btrfs_clone() - clone a range from one file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       const u64 off, const u64 olen, const u64 olen_aligned,
+		       const u64 destoff, int no_time_update)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int ret;
+	const u64 len = olen_aligned;
+	u64 last_dest_end = destoff;
+
+	ret = -ENOMEM;
+	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kvfree(buf);
+		return ret;
+	}
+
+	path->reada = READA_FORWARD;
+	/* Clone data */
+	key.objectid = btrfs_ino(BTRFS_I(src));
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = off;
+
+	while (1) {
+		u64 next_key_min_offset = key.offset + 1;
+		struct btrfs_file_extent_item *extent;
+		int type;
+		u32 size;
+		struct btrfs_key new_key;
+		u64 disko = 0, diskl = 0;
+		u64 datao = 0, datal = 0;
+		u8 comp;
+		u64 drop_start;
+
+		/* Note the key will change type as we walk through the tree */
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+					0, 0);
+		if (ret < 0)
+			goto out;
+		/*
+		 * First search, if no extent item that starts at offset off was
+		 * found but the previous item is an extent item, it's possible
+		 * it might overlap our target range, therefore process it.
+		 */
+		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			nritems = btrfs_header_nritems(path->nodes[0]);
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type > BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid != btrfs_ino(BTRFS_I(src)))
+			break;
+
+		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+			datao = btrfs_file_extent_offset(leaf, extent);
+			datal = btrfs_file_extent_num_bytes(leaf, extent);
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			/* Take upper bound, may be compressed */
+			datal = btrfs_file_extent_ram_bytes(leaf, extent);
+		}
+
+		/*
+		 * The first search might have left us at an extent item that
+		 * ends before our target range's start; this can happen if we
+		 * have holes and the NO_HOLES feature enabled.
+		 */
+		if (key.offset + datal <= off) {
+			path->slots[0]++;
+			goto process_slot;
+		} else if (key.offset >= off + len) {
+			break;
+		}
+		next_key_min_offset = key.offset + datal;
+		size = btrfs_item_size_nr(leaf, slot);
+		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+				   size);
+
+		btrfs_release_path(path);
+		path->leave_spinning = 0;
+
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.objectid = btrfs_ino(BTRFS_I(inode));
+		if (off <= key.offset)
+			new_key.offset = key.offset + destoff - off;
+		else
+			new_key.offset = destoff;
+
+		/*
+		 * Deal with a hole that doesn't have an extent item that
+		 * represents it (NO_HOLES feature enabled).
+		 * This hole is either in the middle of the cloning range or at
+		 * the beginning (fully overlaps it or partially overlaps it).
+		 */
+		if (new_key.offset != last_dest_end)
+			drop_start = last_dest_end;
+		else
+			drop_start = new_key.offset;
+
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct btrfs_clone_extent_info clone_info;
+
+			/*
+			 *    a  | --- range to clone ---|  b
+			 * | ------------- extent ------------- |
+			 */
+
+			/* Subtract range b */
+			if (key.offset + datal > off + len)
+				datal = off + len - key.offset;
+
+			/* Subtract range a */
+			if (off > key.offset) {
+				datao += off - key.offset;
+				datal -= off - key.offset;
+			}
+
+			clone_info.disk_offset = disko;
+			clone_info.disk_len = diskl;
+			clone_info.data_offset = datao;
+			clone_info.data_len = datal;
+			clone_info.file_offset = new_key.offset;
+			clone_info.extent_buf = buf;
+			clone_info.item_size = size;
+			ret = btrfs_punch_hole_range(inode, path, drop_start,
+					new_key.offset + datal - 1, &clone_info,
+					&trans);
+			if (ret)
+				goto out;
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			u64 skip = 0;
+			u64 trim = 0;
+
+			if (off > key.offset) {
+				skip = off - key.offset;
+				new_key.offset += skip;
+			}
+
+			if (key.offset + datal > off + len)
+				trim = key.offset + datal - (off + len);
+
+			if (comp && (skip || trim)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			size -= skip + trim;
+			datal -= skip + trim;
+
+			/*
+			 * If our extent is inline, we know we will drop or
+			 * adjust at most 1 extent item in the destination root.
+			 *
+			 * 1 - adjusting old extent (we may have to split it)
+			 * 1 - add new extent
+			 * 1 - inode update
+			 */
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
+			ret = clone_copy_inline_extent(inode, trans, path,
+						       &new_key, drop_start,
+						       datal, skip, size, buf);
+			if (ret) {
+				if (ret != -EOPNOTSUPP)
+					btrfs_abort_transaction(trans, ret);
+				btrfs_end_transaction(trans);
+				goto out;
+			}
+		}
+
+		btrfs_release_path(path);
+
+		last_dest_end = ALIGN(new_key.offset + datal,
+				      fs_info->sectorsize);
+		ret = clone_finish_inode_update(trans, inode, last_dest_end,
+						destoff, olen, no_time_update);
+		if (ret)
+			goto out;
+		if (new_key.offset + datal >= destoff + len)
+			break;
+
+		btrfs_release_path(path);
+		key.offset = next_key_min_offset;
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+	}
+	ret = 0;
+
+	if (last_dest_end < destoff + len) {
+		/*
+		 * We have an implicit hole that fully or partially overlaps our
+		 * cloning range at its end. This means that we either have the
+		 * NO_HOLES feature enabled or the implicit hole happened due to
+		 * mixing buffered and direct IO writes against this file.
+		 */
+		btrfs_release_path(path);
+		path->leave_spinning = 0;
+
+		ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+				destoff + len - 1, NULL, &trans);
+		if (ret)
+			goto out;
+
+		ret = clone_finish_inode_update(trans, inode, destoff + len,
+						destoff, olen, no_time_update);
+	}
+
+out:
+	btrfs_free_path(path);
+	kvfree(buf);
+	return ret;
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+				       struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+				     struct inode *inode2, u64 loff2, u64 len)
+{
+	if (inode1 < inode2) {
+		swap(inode1, inode2);
+		swap(loff1, loff2);
+	} else if (inode1 == inode2 && loff2 < loff1) {
+		swap(loff1, loff2);
+	}
+	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+				   struct inode *dst, u64 dst_loff)
+{
+	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	int ret;
+
+	/*
+	 * Lock destination range to serialize with concurrent readpages() and
+	 * source range to serialize with relocation.
+	 */
+	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
+	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+	return ret;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+			     struct inode *dst, u64 dst_loff)
+{
+	int ret;
+	u64 i, tail_len, chunk_count;
+	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+	spin_lock(&root_dst->root_item_lock);
+	if (root_dst->send_in_progress) {
+		btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+			      root_dst->root_key.objectid,
+			      root_dst->send_in_progress);
+		spin_unlock(&root_dst->root_item_lock);
+		return -EAGAIN;
+	}
+	root_dst->dedupe_in_progress++;
+	spin_unlock(&root_dst->root_item_lock);
+
+	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+
+	for (i = 0; i < chunk_count; i++) {
+		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+					      dst, dst_loff);
+		if (ret)
+			goto out;
+
+		loff += BTRFS_MAX_DEDUPE_LEN;
+		dst_loff += BTRFS_MAX_DEDUPE_LEN;
+	}
+
+	if (tail_len > 0)
+		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+out:
+	spin_lock(&root_dst->root_item_lock);
+	root_dst->dedupe_in_progress--;
+	spin_unlock(&root_dst->root_item_lock);
+
+	return ret;
+}
+
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+				      u64 off, u64 olen, u64 destoff)
+{
+	struct inode *inode = file_inode(file);
+	struct inode *src = file_inode(file_src);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	int ret;
+	u64 len = olen;
+	u64 bs = fs_info->sb->s_blocksize;
+
+	/*
+	 * TODO:
+	 * - split compressed inline extents. annoying: we need to
+	 *   decompress into destination's address_space (the file offset
+	 *   may change, so source mapping won't do), then recompress (or
+	 *   otherwise reinsert) a subrange.
+	 *
+	 * - split destination inode's inline extents. The inline extents can
+	 *   be either compressed or non-compressed.
+	 */
+
+	/*
+	 * VFS's generic_remap_file_range_prep() protects us from cloning the
+	 * eof block into the middle of a file, which would result in corruption
+	 * if the file size is not blocksize aligned. So we don't need to check
+	 * for that case here.
+	 */
+	if (off + len == src->i_size)
+		len = ALIGN(src->i_size, bs) - off;
+
+	if (destoff > inode->i_size) {
+		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
+		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		if (ret)
+			return ret;
+		/*
+		 * We may have truncated the last block if the inode's size is
+		 * not sector size aligned, so we need to wait for writeback to
+		 * complete before proceeding further, otherwise we can race
+		 * with cloning and attempt to increment a reference to an
+		 * extent that no longer exists (writeback completed right after
+		 * we found the previous extent covering eof and before we
+		 * attempted to increment its reference count).
+		 */
+		ret = btrfs_wait_ordered_range(inode, wb_start,
+					       destoff - wb_start);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Lock destination range to serialize with concurrent readpages() and
+	 * source range to serialize with relocation.
+	 */
+	btrfs_double_extent_lock(src, off, inode, destoff, len);
+	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
+	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+	/*
+	 * Truncate page cache pages so that future reads will see the cloned
+	 * data immediately and not the previous data.
+	 */
+	truncate_inode_pages_range(&inode->i_data,
+				   round_down(destoff, PAGE_SIZE),
+				   round_up(destoff + len, PAGE_SIZE) - 1);
+
+	return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       loff_t *len, unsigned int remap_flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+	bool same_inode = inode_out == inode_in;
+	u64 wb_len;
+	int ret;
+
+	if (!(remap_flags & REMAP_FILE_DEDUP)) {
+		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+		if (btrfs_root_readonly(root_out))
+			return -EROFS;
+
+		if (file_in->f_path.mnt != file_out->f_path.mnt ||
+		    inode_in->i_sb != inode_out->i_sb)
+			return -EXDEV;
+	}
+
+	/* Don't make the dst file partly checksummed */
+	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+		return -EINVAL;
+	}
+
+	/*
+	 * Now that the inodes are locked, we need to start writeback ourselves
+	 * and can not rely on the writeback from the VFS's generic helper
+	 * generic_remap_file_range_prep() because:
+	 *
+	 * 1) For compression we must call filemap_fdatawrite_range() twice
+	 *    (btrfs_fdatawrite_range() does it for us), and the generic
+	 *    helper only calls it once;
+	 *
+	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
+	 *    waits for the writeback to complete, i.e. for IO to be done, and
+	 *    not for the ordered extents to complete. We need to wait for them
+	 *    to complete so that new file extent items are in the fs tree.
+	 */
+	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+	else
+		wb_len = ALIGN(*len, bs);
+
+	/*
+	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
+	 * any in progress could create its ordered extents after we wait for
+	 * existing ordered extents below).
+	 */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	/*
+	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
+	 *
+	 * Btrfs' back references do not have a block level granularity, they
+	 * work at the whole extent level.
+	 * NOCOW buffered writes without data space reserved may not be able
+	 * to fall back to CoW due to lack of data space, thus could cause
+	 * data loss.
+	 *
+	 * Here we take a shortcut by flushing the whole inode, so that all
+	 * nocow writes should reach disk as nocow before we increase the
+	 * reference of the extent. We could do better by only flushing NOCOW
+	 * data, but that needs extra accounting.
+	 *
+	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
+	 * CoWed anyway, not affecting nocow part.
+	 */
+	ret = filemap_flush(inode_in->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+				       wb_len);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+				       wb_len);
+	if (ret < 0)
+		return ret;
+
+	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+					     len, remap_flags);
+}
+
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, loff_t len,
+		unsigned int remap_flags)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *dst_inode = file_inode(dst_file);
+	bool same_inode = dst_inode == src_inode;
+	int ret;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (same_inode)
+		inode_lock(src_inode);
+	else
+		lock_two_nondirectories(src_inode, dst_inode);
+
+	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+					  &len, remap_flags);
+	if (ret < 0 || len == 0)
+		goto out_unlock;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+	else
+		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+out_unlock:
+	if (same_inode)
+		inode_unlock(src_inode);
+	else
+		unlock_two_nondirectories(src_inode, dst_inode);
+
+	return ret < 0 ? ret : len;
+}
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
new file mode 100644
index 000000000000..ecb309b4dad0
--- /dev/null
+++ b/fs/btrfs/reflink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_REFLINK_H
+#define BTRFS_REFLINK_H
+
+#include <linux/fs.h>
+
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t len, unsigned int remap_flags);
+
+#endif /* BTRFS_REFLINK_H */

From a61e1e0df9f321955e3b03b7432e6d65f9f1a223 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 28 Feb 2020 13:04:18 +0000
Subject: [PATCH 168/210] Btrfs: simplify inline extent handling when doing
 reflinks

We cannot reflink parts of an inline extent: we must always reflink the
whole inline extent. We know that inline extents always start at file
offset 0 and that they can never represent an amount of data larger than
the filesystem's sector size (both compressed and uncompressed). We also
have the constraints that reflink operations must have a start offset
that is aligned to the sector size and an end offset that is either
aligned as well or ends at the inode's i_size, so there is no way for
user space to do a reflink operation that refers to only a part of an
inline extent.
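As an illustration of that constraint (this sketch is not part of the
patch; clone_whole_inline() is a hypothetical userspace helper), the only
clone request shape that can touch an inline extent is one that starts at
file offset 0 and runs to EOF, which necessarily covers the whole inline
extent:

	/*
	 * Hypothetical helper: issue the one reflink shape that can
	 * involve an inline extent, via the generic FICLONERANGE ioctl.
	 */
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	static int clone_whole_inline(int src_fd, int dst_fd)
	{
		struct file_clone_range fcr = {
			.src_fd = src_fd,
			.src_offset = 0, /* inline data always lives at offset 0 */
			.src_length = 0, /* 0 means "clone up to source EOF" */
			.dest_offset = 0,
		};

		return ioctl(dst_fd, FICLONERANGE, &fcr);
	}

Any other request shape is rejected before the inline extent code is ever
reached, because its offsets would not be sector size aligned.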
Initially there was a bug in the inlining code that could allow
compressed inline extents that encoded more than one page, but that was
fixed in 2008 by commit 70b99e6959a4c2 ("Btrfs: Compression corner
fixes"), since that was problematic.

So remove all the extent cloning code that deals with the possibility of
cloning only partial inline extents.

Reviewed-by: Josef Bacik
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/reflink.c | 53 ++++++++++++----------------------------------
 1 file changed, 14 insertions(+), 39 deletions(-)

diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 367b11656118..829adbb50837 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -73,9 +73,8 @@ static int clone_copy_inline_extent(struct inode *dst,
 				    struct btrfs_key *new_key,
 				    const u64 drop_start,
 				    const u64 datal,
-				    const u64 skip,
 				    const u64 size,
-				    char *inline_data)
+				    const char *inline_data)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
 	struct btrfs_root *root = BTRFS_I(dst)->root;
@@ -171,12 +170,6 @@ copy_inline_extent:
 	if (ret)
 		return ret;
 
-	if (skip) {
-		const u32 start = btrfs_file_extent_calc_inline_size(0);
-
-		memmove(inline_data + start, inline_data + start + skip, datal);
-	}
-
 	write_extent_buffer(path->nodes[0], inline_data,
 			    btrfs_item_ptr_offset(path->nodes[0],
 						  path->slots[0]),
@@ -240,7 +233,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		struct btrfs_key new_key;
 		u64 disko = 0, diskl = 0;
 		u64 datao = 0, datal = 0;
-		u8 comp;
 		u64 drop_start;
 
 		/* Note the key will change type as we walk through the tree */
@@ -283,7 +275,6 @@ process_slot:
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
-		comp = btrfs_file_extent_compression(leaf, extent);
 		type = btrfs_file_extent_type(leaf, extent);
 		if (type == BTRFS_FILE_EXTENT_REG ||
 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -365,23 +356,18 @@ process_slot:
 			if (ret)
 				goto out;
 		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
-			u64 skip = 0;
-			u64 trim = 0;
-
-			if (off > key.offset) {
-				skip = off - key.offset;
-				new_key.offset += skip;
-			}
-
-			if (key.offset + datal > off + len)
-				trim = key.offset + datal - (off + len);
-
-			if (comp && (skip || trim)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			size -= skip + trim;
-			datal -= skip + trim;
+			/*
+			 * Inline extents always have to start at file offset 0
+			 * and can never be bigger than the sector size. We can
+			 * never clone only parts of an inline extent, since all
+			 * reflink operations must start at a sector size aligned
+			 * offset, and the length must be aligned too or end at
+			 * the i_size (which implies the whole inlined data).
+			 */
+			ASSERT(key.offset == 0);
+			ASSERT(datal <= fs_info->sectorsize);
+			if (key.offset != 0 || datal > fs_info->sectorsize)
+				return -EUCLEAN;
 
 			/*
 			 * If our extent is inline, we know we will drop or
@@ -399,7 +385,7 @@ process_slot:
 
 			ret = clone_copy_inline_extent(inode, trans, path,
 						       &new_key, drop_start,
-						       datal, skip, size, buf);
+						       datal, size, buf);
 			if (ret) {
 				if (ret != -EOPNOTSUPP)
 					btrfs_abort_transaction(trans, ret);
@@ -543,17 +529,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	u64 len = olen;
 	u64 bs = fs_info->sb->s_blocksize;
 
-	/*
-	 * TODO:
-	 * - split compressed inline extents. annoying: we need to
-	 *   decompress into destination's address_space (the file offset
-	 *   may change, so source mapping won't do), then recompress (or
-	 *   otherwise reinsert) a subrange.
-	 *
-	 * - split destination inode's inline extents. The inline extents can
-	 *   be either compressed or non-compressed.
-	 */
-
 	/*
 	 * VFS's generic_remap_file_range_prep() protects us from cloning the
 	 * eof block into the middle of a file, which would result in corruption

From 05a5a7621ce66c142e081ffc24dd6ade6e912061 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 28 Feb 2020 13:04:19 +0000
Subject: [PATCH 169/210] Btrfs: implement full reflink support for inline
 extents

There are a few cases where we don't allow cloning an inline extent into
the destination inode, returning -EOPNOTSUPP to user space. This was done
to prevent several types of file corruption and because it's not very
straightforward to deal with these cases, as they can't rely on simply
copying the inline extent between leaves. Such cases require copying the
inline extent's data into the respective page of the destination inode.

Not supporting these cases makes it harder and more cumbersome to write
applications/libraries that work on any filesystem with reflink support,
since all these cases for which btrfs fails with -EOPNOTSUPP work just
fine on xfs, for example. These unsupported cases are also not documented
anywhere, and explaining exactly which cases fail requires a rather
technical understanding of btrfs internals (inline extents, and when and
where they can exist in a file), so it's not really user friendly.

Also, some test cases from fstests that use fsx, such as generic/522, can
sporadically fail because they trigger one of these cases, and fsx
expects all operations to succeed.

This change adds support for cloning all these cases by copying the
inline extent's data into the respective page of the destination inode.
With this change, test case btrfs/112 from fstests fails because it
expects some clone operations to fail, so it will be updated. Also, a new
test case that exercises all these previously unsupported cases will be
added to fstests.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/reflink.c | 274 +++++++++++++++++++++++++++++++--------------
 1 file changed, 187 insertions(+), 87 deletions(-)

diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 829adbb50837..d1973141d3bb 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/blkdev.h>
 #include <linux/iversion.h>
+#include "compression.h"
 #include "ctree.h"
+#include "delalloc-space.h"
 #include "reflink.h"
 #include "transaction.h"
 
@@ -42,49 +45,131 @@ out:
 	return ret;
 }
 
+static int copy_inline_to_page(struct inode *inode,
+			       const u64 file_offset,
+			       char *inline_data,
+			       const u64 size,
+			       const u64 datal,
+			       const u8 comp_type)
+{
+	const u64 block_size = btrfs_inode_sectorsize(inode);
+	const u64 range_end = file_offset + block_size - 1;
+	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+	struct extent_changeset *data_reserved = NULL;
+	struct page *page = NULL;
+	int ret;
+
+	ASSERT(IS_ALIGNED(file_offset, block_size));
+
+	/*
+	 * We have flushed and locked the ranges of the source and destination
+	 * inodes, we also have locked the inodes, so we are safe to do a
+	 * reservation here. Also we must not do the reservation while holding
+	 * a transaction open, otherwise we would deadlock.
+	 */
+	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+					   block_size);
+	if (ret)
+		goto out;
+
+	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
+				   btrfs_alloc_write_mask(inode->i_mapping));
+	if (!page) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	set_page_extent_mapped(page);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, NULL);
+	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+	if (ret)
+		goto out_unlock;
+
+	if (comp_type == BTRFS_COMPRESS_NONE) {
+		char *map;
+
+		map = kmap(page);
+		memcpy(map, data_start, datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	} else {
+		ret = btrfs_decompress(comp_type, data_start, page, 0,
+				       inline_size, datal);
+		if (ret)
+			goto out_unlock;
+		flush_dcache_page(page);
+	}
+
+	/*
+	 * If our inline data is smaller than the block/page size, then the
+	 * remainder of the block/page is equivalent to zeroes. We had something
+	 * like the following done:
+	 *
+	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+	 * $ sync # (or fsync)
+	 * $ xfs_io -c "falloc 0 4K" file
+	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+	 *
+	 * So what's in the range [500, 4095] corresponds to zeroes.
+	 */
+	if (datal < block_size) {
+		char *map;
+
+		map = kmap(page);
+		memset(map + datal, 0, block_size - datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+
+	SetPageUptodate(page);
+	ClearPageChecked(page);
+	set_page_dirty(page);
+out_unlock:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
+	if (ret)
+		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+					     block_size, true);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+out:
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 /*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm,
- * etc.
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need to turn the inline extent into a non-inline one
- * (even with NO_HOLES enabled). If we find our destination inode only has one
- * inline extent, just overwrite it with the source inline extent if its size
- * is less than the source extent's size, or we could copy the source inline
- * extent's data into the destination inode's inline extent if the latter is
- * greater than the former.
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
  */
 static int clone_copy_inline_extent(struct inode *dst,
-				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    struct btrfs_key *new_key,
 				    const u64 drop_start,
 				    const u64 datal,
 				    const u64 size,
-				    const char *inline_data)
+				    const u8 comp_type,
+				    char *inline_data,
+				    struct btrfs_trans_handle **trans_out)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
 	struct btrfs_root *root = BTRFS_I(dst)->root;
 	const u64 aligned_end = ALIGN(new_key->offset + datal,
 				      fs_info->sectorsize);
+	struct btrfs_trans_handle *trans = NULL;
 	int ret;
 	struct btrfs_key key;
 
-	if (new_key->offset > 0)
-		return -EOPNOTSUPP;
+	if (new_key->offset > 0) {
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
+	}
 
 	key.objectid = btrfs_ino(BTRFS_I(dst));
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -103,72 +188,75 @@ static int clone_copy_inline_extent(struct inode *dst,
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
 		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			/*
+			 * There's an implicit hole at file offset 0, copy the
+			 * inline extent's data to the page.
+			 */
 			ASSERT(key.offset > 0);
-			return -EOPNOTSUPP;
+			ret = copy_inline_to_page(dst, new_key->offset,
+						  inline_data, size, datal,
+						  comp_type);
+			goto out;
 		}
 	} else if (i_size_read(dst) <= datal) {
 		struct btrfs_file_extent_item *ei;
-		u64 ext_len;
 
-		/*
-		 * If the file size is <= datal, make sure there are no other
-		 * extents following (can happen due to a fallocate call with
-		 * the flag FALLOC_FL_KEEP_SIZE).
-		 */
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_file_extent_item);
 		/*
-		 * If it's an inline extent, it can not have other extents
-		 * following it.
+		 * If it's an inline extent, replace it with the source inline
+		 * extent, otherwise copy the source inline extent data into
+		 * the respective page at the destination inode.
 		 */
 		if (btrfs_file_extent_type(path->nodes[0], ei) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			goto copy_inline_extent;
 
-		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
-		if (ext_len > aligned_end)
-			return -EOPNOTSUPP;
-
-		ret = btrfs_next_item(root, path);
-		if (ret < 0) {
-			return ret;
-		} else if (ret == 0) {
-			btrfs_item_key_to_cpu(path->nodes[0], &key,
-					      path->slots[0]);
-			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
-			    key.type == BTRFS_EXTENT_DATA_KEY)
-				return -EOPNOTSUPP;
-		}
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
 	}
 
 copy_inline_extent:
+	ret = 0;
 	/*
 	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt with in the same
	 * way.
 	 */
 	if (i_size_read(dst) > datal) {
 		/*
-		 * If the destination inode has an inline extent, then this
-		 * would require copying the data from the source inline
-		 * extent into the beginning of the destination's inline
-		 * extent. But this is really complex, both extents can be
-		 * compressed or just one of them, which would require
-		 * decompressing and re-compressing data (which could increase
-		 * the new compressed size, not allowing the compressed data
-		 * to fit anymore in an inline extent).
-		 * So just don't support this case for now (it should be rare,
-		 * we are not really saving space when cloning inline extents).
+		 * At the destination offset 0 we have either a hole, a regular
+		 * extent or an inline extent larger than the one we want to
+		 * clone. Deal with all these cases by copying the inline extent
+		 * data into the respective page at the destination inode.
+		 */
-		return -EOPNOTSUPP;
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
 	}
 
 	btrfs_release_path(path);
+	/*
+	 * If we end up here it means we are copying the inline extent into a
+	 * leaf of the destination inode. We know we will drop or adjust at
+	 * most one extent item in the destination root.
+	 *
+	 * 1 unit - adjusting old extent (we may have to split it)
+	 * 1 unit - add new extent
+	 * 1 unit - inode update
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
 	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
 	if (ret)
-		return ret;
+		goto out;
 	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
 	if (ret)
-		return ret;
+		goto out;
 
 	write_extent_buffer(path->nodes[0], inline_data,
 			    btrfs_item_ptr_offset(path->nodes[0],
@@ -176,8 +264,28 @@ copy_inline_extent:
 			    size);
 	inode_add_bytes(dst, datal);
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+out:
+	if (!ret && !trans) {
+		/*
+		 * No transaction here means we copied the inline extent into a
+		 * page of the destination inode.
+		 *
+		 * 1 unit to update inode item
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+		}
+	}
+	if (ret && trans) {
+		btrfs_abort_transaction(trans, ret);
+		btrfs_end_transaction(trans);
+	}
+	if (!ret)
+		*trans_out = trans;
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 destoff, int no_time_update)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path = NULL;
 	struct extent_buffer *leaf;
 	struct btrfs_trans_handle *trans;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		struct btrfs_key new_key;
 		u64 disko = 0, diskl = 0;
 		u64 datao = 0, datal = 0;
+		u8 comp;
 		u64 drop_start;
 
 		/* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ process_slot:
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
 		type = btrfs_file_extent_type(leaf, extent);
 		if (type == BTRFS_FILE_EXTENT_REG ||
 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -369,29 +478,11 @@ process_slot:
 			if (key.offset != 0 || datal > fs_info->sectorsize)
 				return -EUCLEAN;
 
-			/*
-			 * If our extent is inline, we know we will drop or
-			 * adjust at most 1 extent item in the destination root.
-			 *
-			 * 1 - adjusting old extent (we may have to split it)
-			 * 1 - add new extent
-			 * 1 - inode update
-			 */
-			trans = btrfs_start_transaction(root, 3);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
+			ret = clone_copy_inline_extent(inode, path, &new_key,
+						       drop_start, datal, size,
+						       comp, buf, &trans);
+			if (ret)
 				goto out;
-			}
-
-			ret = clone_copy_inline_extent(inode, trans, path,
-						       &new_key, drop_start,
-						       datal, size, buf);
-			if (ret) {
-				if (ret != -EOPNOTSUPP)
-					btrfs_abort_transaction(trans, ret);
-				btrfs_end_transaction(trans);
-				goto out;
-			}
 		}
 
 		btrfs_release_path(path);
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	struct inode *src = file_inode(file_src);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
+	int wb_ret;
 	u64 len = olen;
 	u64 bs = fs_info->sb->s_blocksize;
 
@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	btrfs_double_extent_lock(src, off, inode, destoff, len);
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+	/*
+	 * We may have copied an inline extent into a page of the destination
+	 * range, so wait for writeback to complete before truncating pages
+	 * from the page cache. This is a rare case.
+	 */
+	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+	ret = ret ? ret : wb_ret;
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
 	 * data immediately and not the previous data.

From 726a342120eba8197b3bc5e01af1bd2dbf80f77f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 17 Feb 2020 14:16:52 +0800
Subject: [PATCH 170/210] btrfs: relocation: add error injection points for
 cancelling balance

Introduce a new error injection point, should_cancel_balance().

It's just a wrapper of atomic_read(&fs_info->balance_cancel_req), but
allows us to override the return value.

Currently there is only one location using this function:

- btrfs_balance()
  It checks for cancellation before each block group.

There are other locations checking fs_info->balance_cancel_req, but they
are not used as an indicator to exit, so there is no need to use the
wrapper there.

But there will be more locations coming, and some of them could cause a
kernel panic if not handled properly.

So introduce this error injection point to provide a better test
interface.
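With CONFIG_FAIL_FUNCTION enabled, such an injection point can then be
armed from user space through the fail_function debugfs interface. A
rough sketch follows (this program is illustrative only, it assumes
debugfs is mounted at /sys/kernel/debug, and the exact retval handling
for bool-type injections may differ):

	#include <stdio.h>

	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		const char *d = "/sys/kernel/debug/fail_function";
		char p[256];

		/* Select the function added to the error injection list. */
		snprintf(p, sizeof(p), "%s/inject", d);
		write_str(p, "btrfs_should_cancel_balance");
		/* Force it to report "cancel requested" on every call. */
		snprintf(p, sizeof(p), "%s/btrfs_should_cancel_balance/retval", d);
		write_str(p, "1");
		snprintf(p, sizeof(p), "%s/probability", d);
		write_str(p, "100");
		snprintf(p, sizeof(p), "%s/times", d);
		write_str(p, "-1");
		return 0;
	}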
Reviewed-by: Johannes Thumshirn
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h      |  1 +
 fs/btrfs/relocation.c | 10 ++++++++++
 fs/btrfs/volumes.c    |  2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ecd016f7dab1..e490cfd70bba 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3378,6 +3378,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve);
 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
 
 /* scrub.c */
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ba39f6969a72..0238801e6408 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -9,6 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/error-injection.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3314,6 +3315,15 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
 	return ret;
 }
 
+/*
+ * Allow error injection to test balance cancellation
+ */
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	return atomic_read(&fs_info->balance_cancel_req);
+}
+ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
+
 static int relocate_file_extent_cluster(struct inode *inode,
 					struct file_extent_cluster *cluster)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 74c34c388098..c1909e5f4506 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3994,7 +3994,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
-	    atomic_read(&fs_info->balance_cancel_req)) {
+	    btrfs_should_cancel_balance(fs_info)) {
 		ret = -EINVAL;
 		goto out;
 	}

From 7f913c7cfec62b91ceb4383c4dfb481092ae9d20 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 17 Feb 2020 14:16:53 +0800
Subject: [PATCH 171/210] btrfs: relocation: Check cancel request after each
 data page read

When relocating a data block group with large data extents, we spend
most of our time in relocate_file_extent_cluster(), at the "moving data
extents" stage:

 1)               |  btrfs_relocate_block_group [btrfs]() {
 1)               |    relocate_file_extent_cluster [btrfs]() {
 1) $ 6586769 us  |    }
 1) + 18.260 us   |    relocate_file_extent_cluster [btrfs]();
 1) + 15.770 us   |    relocate_file_extent_cluster [btrfs]();
 1) $ 8916340 us  |  }
 1)               |  btrfs_relocate_block_group [btrfs]() {
 1)               |    relocate_file_extent_cluster [btrfs]() {
 1) $ 11611586 us |    }
 1) + 16.930 us   |    relocate_file_extent_cluster [btrfs]();
 1) + 15.870 us   |    relocate_file_extent_cluster [btrfs]();
 1) $ 14986130 us |  }

To make data relocation cancelling quicker, add an extra balance
cancelling check after each page read in relocate_file_extent_cluster().
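The pattern itself is the usual one for making a long-running loop
cancellable; a standalone sketch of the idea in plain C (this is not
btrfs code, the names are made up):

	#include <errno.h>
	#include <stdatomic.h>

	static atomic_int cancel_requested;	/* set from another thread */

	static int move_cluster(int nr_pages)
	{
		for (int i = 0; i < nr_pages; i++) {
			/* ... read one page and mark it dirty ... */

			/*
			 * Poll after each page so a pending cancel is
			 * honoured within one page's worth of work,
			 * not one cluster's.
			 */
			if (atomic_load(&cancel_requested))
				return -ECANCELED;
		}
		return 0;
	}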
Cleanup and error handling use the same mechanism as if the whole
process had finished.

Reviewed-by: Johannes Thumshirn
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0238801e6408..a4edb821990f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3445,6 +3445,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 		btrfs_throttle(fs_info);
+		if (btrfs_should_cancel_balance(fs_info)) {
+			ret = -ECANCELED;
+			goto out;
+		}
 	}
 	WARN_ON(nr != cluster->nr);
 out:
@@ -4259,6 +4263,14 @@ restart:
 	backref_cache_cleanup(&rc->backref_cache);
 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
 
+	/*
+	 * Even in the case when the relocation is cancelled, we should still
+	 * go through prepare_to_merge() and merge_reloc_roots().
+	 *
+	 * For error (including cancelled balance), prepare_to_merge() will
+	 * mark all reloc trees orphan, then queue them for cleanup in
+	 * merge_reloc_roots().
+	 */
 	err = prepare_to_merge(rc, err);
 
 	merge_reloc_roots(rc);

From f31ea0888caeee75bb8a8514c9f8f28b521b55df Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 17 Feb 2020 14:16:54 +0800
Subject: [PATCH 172/210] btrfs: relocation: Check cancel request after each
 extent found

When relocating data block groups with tons of small extents, or large
metadata block groups, there can be over 200,000 extents. We will
iterate over all extents of such a block group in relocate_block_group(),
where the iteration itself can be quite time-consuming.

So when the user wants to cancel the balance, the extent iteration loop
is another target for a cancellation check.

This patch adds the cancelling check in the extent iteration loop of
relocate_block_group() to make balance cancelling faster.

Reviewed-by: Johannes Thumshirn
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a4edb821990f..008ba7efc4be 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4232,6 +4232,10 @@ restart:
 				break;
 			}
 		}
+		if (btrfs_should_cancel_balance(fs_info)) {
+			err = -ECANCELED;
+			break;
+		}
 	}
 	if (trans && progress && err == -ENOSPC) {
 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);

From 63f018be577f7cb4787f594400976b4e779b5cfb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Tue, 10 Mar 2020 10:59:31 +0200
Subject: [PATCH 173/210] btrfs: Remove __ prefix from btrfs_block_rsv_release

Currently the non-prefixed version is a simple wrapper used to hide the
4th argument of the prefixed version. This doesn't bring much value in
practice and only makes the code harder to follow by adding another
level of indirection. Rectify this by removing the __ prefix and having
only one public function to release bytes from a block reservation. No
semantic changes.
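For reference, the calling convention that remains after this patch,
condensed from the hunks below (the two example call shapes are taken
from delalloc-space.c and transaction.c respectively):

	u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_rsv *block_rsv, u64 num_bytes,
				    u64 *qgroup_to_release);

	/* A caller that wants the freed byte count for qgroup accounting: */
	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
					   &qgroup_to_release);

	/* The typical caller, which does not: */
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved, NULL);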
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/block-rsv.c      | 11 ++++++-----
 fs/btrfs/block-rsv.h      | 12 ++----------
 fs/btrfs/delalloc-space.c |  4 ++--
 fs/btrfs/delayed-inode.c  |  6 ++----
 fs/btrfs/delayed-ref.c    |  3 +--
 fs/btrfs/inode-map.c      |  2 +-
 fs/btrfs/inode.c          |  2 +-
 fs/btrfs/props.c          |  2 +-
 fs/btrfs/relocation.c     |  8 ++++----
 fs/btrfs/root-tree.c      |  2 +-
 fs/btrfs/transaction.c    |  6 +++---
 11 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 2f6439435cc3..27efec8f7c5b 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -203,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 {
 	if (!rsv)
 		return;
-	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
 	kfree(rsv);
 }
 
@@ -270,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
-			      struct btrfs_block_rsv *block_rsv,
-			      u64 num_bytes, u64 *qgroup_to_release)
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			    u64 *qgroup_to_release)
 {
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
@@ -436,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
+				NULL);
 	WARN_ON(fs_info->trans_block_rsv.size > 0);
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5..0b6ae5302837 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 			     int min_factor);
 void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 			       u64 num_bytes, bool update_size);
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 			      struct btrfs_block_rsv *block_rsv,
 			      u64 num_bytes, u64 *qgroup_to_release);
 void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
@@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
 struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u32 blocksize);
-
-static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
-					   struct btrfs_block_rsv *block_rsv,
-					   u64 num_bytes)
-{
-	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
-}
-
 static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
 					 struct btrfs_block_rsv *block_rsv,
 					 u32 blocksize)
 {
 	btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
-	btrfs_block_rsv_release(fs_info, block_rsv, 0);
+	btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
 }
 
 #endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index f23d07a981e4..1245739a3a6e 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -330,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 	 * are releasing 0 bytes, and then we'll just get the reservation over
 	 * the size free'd.
 	 */
-	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
-					     &qgroup_to_release);
+	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+					   &qgroup_to_release);
 	if (released > 0)
 		trace_btrfs_space_reservation(fs_info, "delalloc",
 					      btrfs_ino(inode), released, 0);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ee70584ddc45..f678980fe564 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	trace_btrfs_space_reservation(fs_info, "delayed_item",
 				      item->key.objectid,
 				      item->bytes_reserved, 0);
-	btrfs_block_rsv_release(fs_info, rsv,
-				item->bytes_reserved);
+	btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
 }
 
 static int btrfs_delayed_inode_reserve_metadata(
@@ -677,8 +676,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
 	rsv = &fs_info->delayed_block_rsv;
 	trace_btrfs_space_reservation(fs_info, "delayed_inode",
 				      node->inode_id, node->bytes_reserved, 0);
-	btrfs_block_rsv_release(fs_info, rsv,
-				node->bytes_reserved);
+	btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
 	if (qgroup_free)
 		btrfs_qgroup_free_meta_prealloc(node->root,
 						node->bytes_reserved);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index dfdb7d4f8406..353cc2994d10 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
 	u64 released = 0;
 
-	released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
-					     NULL);
+	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 	if (released)
 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 					      0, released, 0);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d5c9c69d8263..6009e0e939b5 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -515,7 +515,7 @@ out_release:
 	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
 				      trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(fs_info, trans->block_rsv,
-				trans->bytes_reserved);
+				trans->bytes_reserved, NULL);
 out:
 	trans->block_rsv = rsv;
 	trans->bytes_reserved = num_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e138c83cc6e..f7c6645ab75b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8702,7 +8702,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 			break;
 		}
 
-		btrfs_block_rsv_release(fs_info, rsv, -1);
+		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 					      rsv, min_size, false);
 		BUG_ON(ret);	/* shouldn't happen */
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index deb59e7cfcac..ff1ff90e48b1 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 
 		if (need_reserve) {
 			btrfs_block_rsv_release(fs_info, trans->block_rsv,
-					num_bytes);
+						num_bytes, NULL);
 			if (ret)
 				return ret;
 		}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008ba7efc4be..782c9e90fa6f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2478,7 +2478,7 @@ again:
 	if (IS_ERR(trans)) {
 		if (!err)
 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
-						num_bytes);
+						num_bytes, NULL);
 		return PTR_ERR(trans);
 	}
 
@@ -2486,7 +2486,7 @@ again:
 	if (num_bytes != rc->merging_rsv_size) {
 		btrfs_end_transaction(trans);
 		btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes); + num_bytes, NULL); goto again; } } @@ -4265,7 +4265,7 @@ restart: set_reloc_control(rc); backref_cache_cleanup(&rc->backref_cache); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* * Even in the case when the relocation is cancelled, we should all go @@ -4281,7 +4281,7 @@ restart: rc->merge_reloc_tree = 0; unset_reloc_control(rc); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 98b6e0d980f9..668f22844017 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -520,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ca617441ecbb..096c0aab34ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -221,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -675,7 +675,7 @@ join_fail: alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes); + num_bytes, NULL); reserve_fail: btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); return ERR_PTR(ret); @@ -898,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); + trans->bytes_reserved, NULL); trans->bytes_reserved = 0; } From 0078a9f941d2a994d756c330f225e888c31c768d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 10 Mar 2020 11:43:51 +0200 Subject: [PATCH 174/210] btrfs: Remove block_rsv parameter from btrfs_drop_snapshot It's no longer used following 30d40577e322 ("btrfs: reloc: Also queue orphan reloc tree for cleanup to avoid BUG_ON()"), so just remove it. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ++--- fs/btrfs/extent-tree.c | 9 +-------- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/transaction.c | 4 ++-- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e490cfd70bba..af7229eac02b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2657,9 +2657,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - int update_ref, int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a3778937d705..001605b672b8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5288,9 +5288,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -5329,9 +5327,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - if (block_rsv) - trans->block_rsv = block_rsv; - /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being @@ -5459,8 +5454,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, err = PTR_ERR(trans); goto out_free; } - if (block_rsv) - trans->block_rsv = block_rsv; } } btrfs_release_path(path); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 782c9e90fa6f..5dfb7ddbb4ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2276,7 +2276,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) root->reloc_root = NULL; if (reloc_root) { - ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); if (ret2 < 0 && !ret) ret = ret2; } @@ -2289,7 +2289,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); + ret2 = btrfs_drop_snapshot(root, 0, 1); if (ret2 < 0 && !ret) ret = ret2; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 096c0aab34ee..5939bca9d5eb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2427,9 +2427,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); + ret = btrfs_drop_snapshot(root, 0, 0); else - ret = btrfs_drop_snapshot(root, NULL, 1, 0); + ret = btrfs_drop_snapshot(root, 1, 0); return (ret < 0) ? 0 : 1; } From f8e6608180a31cc72a23b74969da428da236dbd1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 4 Mar 2020 16:57:35 -0800 Subject: [PATCH 175/210] btrfs: implement migratepage callback for data pages Currently btrfs doesn't provide a migratepage callback for data pages. 
It means that fallback_migrate_page() is used to migrate btrfs pages.

fallback_migrate_page() cannot move dirty pages, instead it tries to
flush them (in sync mode) or just fails (in async mode).

In sync mode, pages which are scheduled to be processed by
btrfs_writepage_fixup_worker() can't be effectively flushed by the
migration code, because there is no established way to wait for the
completion of the delayed work.

It all leads to page migration failures.

To fix it the patch implements a btrfs-specific migratepage callback,
which is similar to iomap_migrate_page() used by some other
filesystems, except it does take care of the PagePrivate2 flag which
is used for data ordering purposes.

Reviewed-by: Chris Mason
Signed-off-by: Roman Gushchin
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f7c6645ab75b..f85051d0390f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -28,6 +28,7 @@
 #include <linux/magic.h>
 #include <linux/iversion.h>
 #include <linux/swap.h>
+#include <linux/migrate.h>
 #include <linux/sched/mm.h>
 #include <asm/unaligned.h>
 #include "misc.h"
@@ -8327,6 +8328,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags);
 }
 
+#ifdef CONFIG_MIGRATION
+static int btrfs_migratepage(struct address_space *mapping,
+			     struct page *newpage, struct page *page,
+			     enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		get_page(newpage);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		put_page(page);
+		SetPagePrivate(newpage);
+	}
+
+	if (PagePrivate2(page)) {
+		ClearPagePrivate2(page);
+		SetPagePrivate2(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 				 unsigned int length)
 {
@@ -10532,6 +10566,9 @@ static const struct address_space_operations btrfs_aops = {
 	.direct_IO = btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage = btrfs_releasepage,
+#ifdef CONFIG_MIGRATION
+	.migratepage = btrfs_migratepage,
+#endif
 	.set_page_dirty = btrfs_set_page_dirty,
 	.error_remove_page = generic_error_remove_page,
 	.swap_activate = btrfs_swap_activate,

From db161806dc5615a42893a1df7d803d12890699d9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Tue, 10 Mar 2020 11:00:35 +0200
Subject: [PATCH 176/210] btrfs: account ticket size at add/delete time

Instead of iterating all pending tickets on the normal/priority list
to sum their total size, the cost can be amortized across ticket
addition/removal. This turns O(n) + O(m) (where n is the size of the
normal list and m of the priority list) into O(1). This will mostly
have an effect in workloads that experience heavy flushing.
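For readers following the series, the amortization pattern this patch
applies can be sketched in isolation. This is a minimal sketch with
hypothetical names (space_pool, ticket), not the btrfs code itself; the
real fields and lock appear in the diff below.

  #include <linux/list.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>

  /* Maintain a running total at add/remove time so readers never
   * have to walk the ticket lists to compute it. */
  struct ticket {
  	struct list_head list;
  	u64 bytes;
  };

  struct space_pool {
  	spinlock_t lock;	/* protects the list and the total */
  	struct list_head tickets;
  	u64 reclaim_size;	/* sum of bytes of all queued tickets */
  };

  /* O(1): account at insertion time, caller holds pool->lock */
  static void pool_add_ticket(struct space_pool *p, struct ticket *t)
  {
  	list_add_tail(&t->list, &p->tickets);
  	p->reclaim_size += t->bytes;
  }

  /* Every removal must mirror the addition, caller holds pool->lock */
  static void pool_del_ticket(struct space_pool *p, struct ticket *t)
  {
  	list_del_init(&t->list);
  	p->reclaim_size -= t->bytes;
  }

Both the updates and the reads must happen under the same lock that
protects the lists, which is why the patch adds a lockdep_assert_held()
next to the read of the cached value.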
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/space-info.c | 12 ++++++------
 fs/btrfs/space-info.h |  7 +++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 9cb511d8cd9d..8b0fe053a25d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -389,6 +389,8 @@ again:
 					      space_info, ticket->bytes);
 			list_del_init(&ticket->list);
+			ASSERT(space_info->reclaim_size >= ticket->bytes);
+			space_info->reclaim_size -= ticket->bytes;
 			ticket->bytes = 0;
 			space_info->tickets_id++;
 			wake_up(&ticket->wait);
@@ -784,16 +786,12 @@ static inline u64
 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 				 struct btrfs_space_info *space_info)
 {
-	struct reserve_ticket *ticket;
 	u64 used;
 	u64 avail;
 	u64 expected;
-	u64 to_reclaim = 0;
+	u64 to_reclaim = space_info->reclaim_size;
 
-	list_for_each_entry(ticket, &space_info->tickets, list)
-		to_reclaim += ticket->bytes;
-	list_for_each_entry(ticket, &space_info->priority_tickets, list)
-		to_reclaim += ticket->bytes;
+	lockdep_assert_held(&space_info->lock);
 
 	avail = calc_available_free_space(fs_info, space_info,
 					  BTRFS_RESERVE_FLUSH_ALL);
@@ -1192,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		ASSERT(space_info->reclaim_size >= 0);
 		ticket.bytes = orig_bytes;
 		ticket.error = 0;
+		space_info->reclaim_size += ticket.bytes;
 		init_waitqueue_head(&ticket.wait);
 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
 			list_add_tail(&ticket.list, &space_info->tickets);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 24514cd2c6c1..0a5001ef1481 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -54,6 +54,13 @@ struct btrfs_space_info {
 	struct list_head ro_bgs;
 	struct list_head priority_tickets;
 	struct list_head tickets;
+
+	/*
+	 * Size of space that needs to be reclaimed in order to satisfy
+	 * pending tickets
+	 */
+	u64 reclaim_size;
+
 	/*
 	 * tickets_id just indicates the next ticket will be handled, so note
 	 * it's not stored per ticket.

From 95418ed1d10774cd9a49af6f39e216c1256f1eeb Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 9 Mar 2020 12:41:05 +0000
Subject: [PATCH 177/210] btrfs: fix missing file extent item for hole after
 ranged fsync

When doing a fast fsync for a range that starts at an offset greater
than zero, we can end up with a log that, when replayed, causes the
respective inode to miss a file extent item representing a hole if we
are not using the NO_HOLES feature. This is because for fast fsyncs we
don't log any extents that cover a range different from the one
requested in the fsync.

Example scenario to trigger it:

$ mkfs.btrfs -O ^no-holes -f /dev/sdd
$ mount /dev/sdd /mnt

# Create a file with a single 256K extent and fsync it to clear the
# full sync bit in the inode - we want the msync below to trigger a
# fast fsync.
$ xfs_io -f -c "pwrite -S 0xab 0 256K" -c "fsync" /mnt/foo

# Force a transaction commit and wipe out the log tree.
$ sync

# Dirty 768K of data, increasing the file size to 1Mb, and flush only
# the range from 256K to 512K without updating the log tree
# (sync_file_range() does not trigger fsync, it only starts writeback
# and waits for it to finish).
$ xfs_io -c "pwrite -S 0xcd 256K 768K" /mnt/foo
$ xfs_io -c "sync_range -abw 256K 256K" /mnt/foo

# Now dirty the range from 768K to 1M again and sync that range.
$ xfs_io -c "mmap -w 768K 256K" \ -c "mwrite -S 0xef 768K 256K" \ -c "msync -s 768K 256K" \ -c "munmap" \ /mnt/foo # Mount to replay the log. $ mount /dev/sdd /mnt $ umount /mnt $ btrfs check /dev/sdd Opening filesystem to check... Checking filesystem on /dev/sdd UUID: 482fb574-b288-478e-a190-a9c44a78fca6 [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots root 5 inode 257 errors 100, file extent discount Found file extent holes: start: 262144, len: 524288 ERROR: errors found in fs roots found 720896 bytes used, error(s) found total csum bytes: 512 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123514 file data blocks allocated: 589824 referenced 589824 Fix this issue by setting the range to full (0 to LLONG_MAX) when the NO_HOLES feature is not enabled. This results in extra work being done but it gives the guarantee we don't end up with missing holes after replaying the log. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 31c72371a164..4a536387e992 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2071,6 +2071,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx, inode); + /* + * Set the range to full if the NO_HOLES feature is not enabled. + * This is to avoid missing file extent items representing holes after + * replaying the log. + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { + start = 0; + end = LLONG_MAX; + } + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by From a5eeb3d17b979f7afe3ac68fe049ce8b0a039b03 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Mar 2020 12:41:06 +0000 Subject: [PATCH 178/210] btrfs: add helper to get the end offset of a file extent item Getting the end offset for a file extent item requires a bit of code since the extent can be either inline or regular/prealloc. There are some places all over the code base that open code this logic and in another patch later in this series it will be needed again. Therefore encapsulate this logic in a helper function and use it. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file-item.c | 40 ++++++++++++++++++++++++++++------------ fs/btrfs/inode.c | 10 +--------- fs/btrfs/send.c | 44 +++----------------------------------------- fs/btrfs/tree-log.c | 15 +-------------- 5 files changed, 34 insertions(+), 76 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af7229eac02b..ebca8e8365ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2836,6 +2836,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6c849e8fd5a1..b618ad5339ba 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1040,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, fi); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - size = btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - } - + extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -1098,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, root->root_key.objectid); } } + +/* + * Returns the end offset (non inclusive) of the file extent item the given path + * points to. If it points to an inline extent, the returned offset is rounded + * up to the sector size. 
+ */ +u64 btrfs_file_extent_end(const struct btrfs_path *path) +{ + const struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + end = btrfs_file_extent_ram_bytes(leaf, fi); + end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); + } else { + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + } + + return end; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f85051d0390f..d31543350799 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6521,6 +6521,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; + extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ @@ -6531,18 +6532,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, btrfs_ino(inode)); goto out; } - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, item); - trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6b86841315be..e47f768cec3d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) { struct btrfs_path *path; struct btrfs_root *root = sctx->send_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 extent_end; - u8 type; int ret; path = alloc_path_for_send(); @@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) goto out; - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key.offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); out: btrfs_free_path(path); return ret; @@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, break; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(leaf, fi); - - extent_end = ALIGN(key.offset + size, - root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - } + extent_end = btrfs_file_extent_end(path); if (extent_end <= start) goto next; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { @@ -5704,9 +5681,6 @@ out: static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - struct 
btrfs_file_extent_item *fi; - u64 extent_end; - u8 type; int ret = 0; if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) @@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, return ret; } - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key->offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key->offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - if (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset) { /* @@ -5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, else ret = 0; } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 19c107be9ef6..b723ee03de26 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4555,9 +4555,7 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, return ret; while (true) { - struct btrfs_file_extent_item *extent; struct extent_buffer *leaf = path->nodes[0]; - u64 len; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); @@ -4606,18 +4604,7 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; } - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - prev_extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(leaf, extent); - prev_extent_end = key.offset + len; - } - + prev_extent_end = btrfs_file_extent_end(path); path->slots[0]++; cond_resched(); } From da447009a25609327309f695b244710f12794471 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Mar 2020 12:41:07 +0000 Subject: [PATCH 179/210] btrfs: factor out inode items copy loop from btrfs_log_inode() The function btrfs_log_inode() is quite large and so is its loop which iterates the inode items from the fs/subvolume tree and copies them into a log tree. Because this is a large loop inside a very large function and because an upcoming patch in this series needs to add some more logic inside that loop, move the loop into a helper function to make it a bit more manageable. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 274 ++++++++++++++++++++++---------------------- 1 file changed, 138 insertions(+), 136 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b723ee03de26..5cf04d3f067d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4941,6 +4941,138 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, return ret; } +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_key *min_key, + const struct btrfs_key *max_key, + struct btrfs_path *path, + struct btrfs_path *dst_path, + const u64 logged_isize, + const bool recursive_logging, + const int inode_only, + struct btrfs_log_ctx *ctx, + bool *need_log_inode_item) +{ + struct btrfs_root *root = inode->root; + int ins_start_slot = 0; + int ins_nr = 0; + int ret; + + while (1) { + ret = btrfs_search_forward(root, min_key, path, trans->transid); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } +again: + /* Note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key->objectid != max_key->objectid) + break; + if (min_key->type > max_key->type) + break; + + if (min_key->type == BTRFS_INODE_ITEM_KEY) + *need_log_inode_item = false; + + if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { + u64 other_ino = 0; + u64 other_parent = 0; + + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], min_key, inode, + &other_ino, &other_parent); + if (ret < 0) { + return ret; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, + inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + + ret = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); + if (ret) + return ret; + btrfs_release_path(path); + goto next_key; + } + } + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + goto next_slot; + } + + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + btrfs_release_path(path); +next_key: + if (min_key->offset < (u64)-1) { + min_key->offset++; + } else if (min_key->type < max_key->type) { + min_key->type++; + min_key->offset = 0; + } else { + break; + } + } + if (ins_nr) + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, 
logged_isize); + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4970,9 +5102,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - int nritems; - int ins_start_slot = 0; - int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5103,139 +5232,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto out_unlock; } - while (1) { - ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, - path, trans->transid); - if (ret < 0) { - err = ret; - goto out_unlock; - } - if (ret != 0) - break; -again: - /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != ino) - break; - if (min_key.type > max_key.type) - break; - - if (min_key.type == BTRFS_INODE_ITEM_KEY) - need_log_inode_item = false; - - if ((min_key.type == BTRFS_INODE_REF_KEY || - min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { - u64 other_ino = 0; - u64 other_parent = 0; - - ret = btrfs_check_ref_name_override(path->nodes[0], - path->slots[0], &min_key, inode, - &other_ino, &other_parent); - if (ret < 0) { - err = ret; - goto out_unlock; - } else if (ret > 0 && ctx && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - if (ins_nr > 0) { - ins_nr++; - } else { - ins_nr = 1; - ins_start_slot = path->slots[0]; - } - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - - err = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); - if (err) - goto out_unlock; - btrfs_release_path(path); - goto next_key; - } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key.type == BTRFS_XATTR_ITEM_KEY) { - if (ins_nr == 0) - goto next_slot; - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - goto next_slot; - } - - if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { - ins_nr++; - goto next_slot; - } else if (!ins_nr) { - ins_start_slot = path->slots[0]; - ins_nr = 1; - goto next_slot; - } - - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 1; - ins_start_slot = path->slots[0]; -next_slot: - - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(path->nodes[0], &min_key, - path->slots[0]); - goto again; - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } - btrfs_release_path(path); -next_key: - if (min_key.offset < (u64)-1) { - min_key.offset++; - } else if (min_key.type < max_key.type) { - min_key.type++; - min_key.offset = 0; - } else { - break; - } - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } + err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + path, dst_path, logged_isize, + recursive_logging, inode_only, ctx, + 
&need_log_inode_item);
+	if (err)
+		goto out_unlock;
 
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);

From 0a8068a3dd4294ebd8b99969cdc6ccde0f944ba6 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 9 Mar 2020 12:41:08 +0000
Subject: [PATCH 180/210] btrfs: make ranged full fsyncs more efficient

Commit 0c713cbab6200b ("Btrfs: fix race between ranged fsync and
writeback of adjacent ranges") fixed a bug where we could end up with
file extent items in a log tree that represent file ranges that overlap
due to a race between the hole detection of a ranged full fsync and
writeback for a different file range.

The problem was solved by forcing any ranged full fsync to become a
non-ranged full fsync - setting the range start to 0 and the end offset
to LLONG_MAX. This was a simple solution because the code that detected
and marked holes was very complex; it used to be done at copy_items()
and implied several searches on the fs/subvolume tree. The drawback of
that solution was that we started to flush delalloc for the entire file
and wait for all the ordered extents to complete for ranged full fsyncs
(including ordered extents covering ranges completely outside the given
range). Fortunately ranged full fsyncs are not the most common case
(hopefully for most workloads).

However a later fix for detecting and marking holes was made by commit
0e56315ca147b3 ("Btrfs: fix missing hole after hole punching and fsync
when using NO_HOLES") and it simplified the detection of holes a lot;
copy_items() no longer does it and we now do it in a much simpler way
at btrfs_log_holes().

This now makes it possible for the hole detection code to operate only
on the initial range instead of the whole file, while also avoiding the
need to flush delalloc for the entire file and to wait for ordered
extents that cover ranges that don't overlap the given range.

We must also take special care to skip file extent items that fall
entirely outside the fsync range when copying inode items from the
fs/subvolume tree into the log tree. This is to avoid races with
ordered extent completion for extents falling outside the fsync range,
which could cause us to end up with file extent items in the log tree
that have overlapping ranges. For example, if the fsync range is
[1Mb, 2Mb], when we copy inode items we could copy an extent item for
the range [0, 512Kb], then release the search path, and before moving
to the next leaf an ordered extent for a range of [256Kb, 512Kb]
completes. This would cause us to copy the new extent item for range
[256Kb, 512Kb] into the log tree after we have copied one for the range
[0, 512Kb]; the extents overlap, resulting in a corruption.

So this change just does these steps:

1) When the NO_HOLES feature is enabled it leaves the initial range
intact - no longer sets it to [0, LLONG_MAX] when the full sync bit is
set in the inode. If NO_HOLES is not enabled, always set the range to a
full one, just like before this change, to avoid missing file extent
items representing holes after replaying the log (for both full and
fast fsyncs);

2) Make the hole detection code operate only on the fsync range;

3) Make the code that copies items from the fs/subvolume tree skip
copying file extent items that cover a range completely outside the
range of the fsync.
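The skip/stop rules from point 3 reduce to a small decision per extent
item. A simplified sketch follows (hypothetical helper name; the actual
checks are added to copy_inode_items_to_log() in the diff below):

  /* Classify a file extent item ending at extent_end against the
   * sector-aligned fsync range [start, end). */
  enum item_action {
  	ITEM_SKIP,		/* ends at or before start: not copied */
  	ITEM_COPY,		/* overlaps the range: copied */
  	ITEM_COPY_THEN_STOP,	/* reaches or crosses end: copied, then stop */
  };

  static enum item_action classify_extent_item(u64 extent_end,
  					     u64 start, u64 end)
  {
  	if (extent_end <= start)
  		return ITEM_SKIP;
  	if (extent_end >= end)
  		return ITEM_COPY_THEN_STOP;
  	return ITEM_COPY;
  }

Items straddling a range boundary are copied whole even though they
extend outside the range, which the comment added by the patch notes is
safe.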
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 ------- fs/btrfs/tree-log.c | 93 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4a536387e992..18c88f514a0d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2102,19 +2102,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. Do this while holding the inode lock, to avoid races - * with other tasks. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5cf04d3f067d..0eebd6a02d41 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,8 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4534,13 +4534,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, static int btrfs_log_holes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + const u64 start, + const u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); - u64 prev_extent_end = 0; + u64 prev_extent_end = start; int ret; if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) @@ -4548,14 +4550,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + while (true) { struct extent_buffer *leaf = path->nodes[0]; + u64 extent_end; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); @@ -4572,9 +4581,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; + extent_end = btrfs_file_extent_end(path); + if (extent_end <= start) + goto next_slot; + /* We have a hole, log it. 
*/ if (prev_extent_end < key.offset) { - const u64 hole_len = key.offset - prev_extent_end; + u64 hole_len; + + if (key.offset >= end) + hole_len = end - prev_extent_end; + else + hole_len = key.offset - prev_extent_end; /* * Release the path to avoid deadlocks with other code @@ -4604,16 +4622,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; } - prev_extent_end = btrfs_file_extent_end(path); + prev_extent_end = min(extent_end, end); + if (extent_end >= end) + break; +next_slot: path->slots[0]++; cond_resched(); } - if (prev_extent_end < i_size) { + if (prev_extent_end < end && prev_extent_end < i_size) { u64 hole_len; btrfs_release_path(path); - hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + hole_len = min(ALIGN(i_size, fs_info->sectorsize), end); + hole_len -= prev_extent_end; ret = btrfs_insert_file_extent(trans, root->log_root, ino, prev_extent_end, 0, 0, hole_len, 0, hole_len, @@ -4950,6 +4972,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, const u64 logged_isize, const bool recursive_logging, const int inode_only, + const u64 start, + const u64 end, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { @@ -4958,6 +4982,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, int ins_nr = 0; int ret; + /* + * We must make sure we don't copy extent items that are entirely out of + * the range [start, end - 1]. This is not just an optimization to avoid + * copying but also needed to avoid a corruption where we end up with + * file extent items in the log tree that have overlapping ranges - this + * can happen if we race with ordered extent completion for ranges that + * are outside our target range. For example we copy an extent item and + * when we move to the next leaf, that extent was trimmed and a new one + * covering a subrange of it, but with a higher key, was inserted - we + * would then copy this other extent too, resulting in a log tree with + * 2 extent items that represent overlapping ranges. + * + * We can copy the entire extents at the range bondaries however, even + * if they cover an area outside the target range. That's ok. 
+ */ while (1) { ret = btrfs_search_forward(root, min_key, path, trans->transid); if (ret < 0) @@ -5025,6 +5064,29 @@ again: goto next_slot; } + if (min_key->type == BTRFS_EXTENT_DATA_KEY) { + const u64 extent_end = btrfs_file_extent_end(path); + + if (extent_end <= start) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, + path, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + goto next_slot; + } + if (extent_end >= end) { + ins_nr++; + if (ins_nr == 1) + ins_start_slot = path->slots[0]; + break; + } + } + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; @@ -5090,8 +5152,8 @@ next_key: static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -5119,6 +5181,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return -ENOMEM; } + start = ALIGN_DOWN(start, fs_info->sectorsize); + end = ALIGN(end, fs_info->sectorsize); + min_key.objectid = ino; min_key.type = BTRFS_INODE_ITEM_KEY; min_key.offset = 0; @@ -5234,8 +5299,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, - recursive_logging, inode_only, ctx, - &need_log_inode_item); + recursive_logging, inode_only, + start, end, ctx, &need_log_inode_item); if (err) goto out_unlock; @@ -5248,7 +5313,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path, start, end); if (err) goto out_unlock; } From b39c8f5a393e7fc4459ac2aa1582409839eae2f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Mar 2020 11:21:44 -0400 Subject: [PATCH 181/210] btrfs: fix ref-verify to catch operations on 0 ref extents While debugging I noticed I wasn't getting ref verify errors before everything blew up. Turns out it's because we don't warn when we try to add a normal ref via btrfs_inc_ref() if the block entry exists but has 0 references. This is incorrect, we should never be doing anything other than adding a new extent once a block entry drops to 0 references. 
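The invariant being enforced can be stated compactly. A hedged sketch
with hypothetical names (not the ref-verify types; the real check added
by the patch is in the diff below):

  /* Once a block entry's reference count reaches zero, the only legal
   * next operation is allocating the extent anew; anything else means
   * someone is acting on a stale or corrupted reference. */
  #define ACTION_ALLOC_NEW_EXTENT	0	/* hypothetical action id */

  struct block_entry_view {
  	u64 num_refs;
  };

  static bool ref_action_valid(const struct block_entry_view *be,
  			     int action)
  {
  	if (be->num_refs == 0)
  		return action == ACTION_ALLOC_NEW_EXTENT;
  	return true;
  }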
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ref-verify.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 454a1015d026..7887317033c9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -803,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 			kfree(ref);
 			kfree(ra);
 			goto out_unlock;
+		} else if (be->num_refs == 0) {
+			btrfs_err(fs_info,
+	"trying to do action %d for a bytenr that has 0 total references",
+				action);
+			dump_block_entry(fs_info, be);
+			dump_ref_action(fs_info, ra);
+			kfree(ref);
+			kfree(ra);
+			goto out_unlock;
 		}
 
 		if (!parent) {

From 19b546d7a1b2eaf575d04a8434e32f679877c45e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 10 Mar 2020 16:14:15 +0800
Subject: [PATCH 182/210] btrfs: relocation: Use btrfs_find_all_leafs to
 locate data extent parent tree leaves

In relocation, we need to locate all parent tree leaves referring to
one data extent, thus we have a complex mechanism to iterate through
the extent tree and subvolume trees to locate the related leaves.

However this is already done in backref.c: we have
btrfs_find_all_leafs(), which can return a ulist containing all leaves
referring to that data extent.

Use btrfs_find_all_leafs() to replace find_data_references().

There is special handling for v1 space cache data extents: we need to
delete the v1 space cache data extents to keep them from blocking the
data relocation. In this patch, the special handling is done by
re-iterating the root tree leaf.

Although it's a little less efficient than the old handling,
considering we can reuse a lot of code, it should be acceptable.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c    |   8 +-
 fs/btrfs/backref.h    |   4 +
 fs/btrfs/relocation.c | 322 ++++++++----------------------------------
 3 files changed, 65 insertions(+), 269 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 327e4480957b..f2728fb3ee8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1409,10 +1409,10 @@ static void free_leaf_list(struct ulist *blocks)
  *
  * returns 0 on success, <0 on error
  */
-static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 time_seq, struct ulist **leafs,
-				const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 time_seq, struct ulist **leafs,
+			 const u64 *extent_item_pos, bool ignore_offset)
 {
 	int ret;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 777f61dc081e..723d6da99114 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 time_seq, struct ulist **leafs,
+			 const u64 *extent_item_pos, bool ignore_offset);
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
 			 u64 time_seq, struct ulist **roots,
 			 bool ignore_offset);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5dfb7ddbb4ab..536fc4cb8fcc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -23,6 +23,7 @@
 #include "print-tree.h"
 #include "delalloc-space.h"
 #include "block-group.h"
+#include 
"backref.h" /* * Relocation overview @@ -3620,31 +3621,6 @@ out: return ret; } -/* - * helper to check if the block use full backrefs for pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, - eb->start, btrfs_header_level(eb), 1, - NULL, &flags); - BUG_ON(ret); - - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - return ret; -} - static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group *block_group, struct inode *inode, @@ -3688,174 +3664,40 @@ out: } /* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans fs tree to find blocks reference the data extent + * Locate the free space cache EXTENT_DATA in root tree leaf and delete the + * cache inode, to avoid free space cache data extent blocking data relocation. */ -static int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) +static int delete_v1_space_cache(struct extent_buffer *leaf, + struct btrfs_block_group *block_group, + u64 data_bytenr) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; + u64 space_cache_ino; + struct btrfs_file_extent_item *ei; struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; + bool found = false; + int i; int ret; - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); + if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) + return 0; - /* - * This is an extent belonging to the free space cache, lets just delete - * it and redo the search. 
- */ - if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { - ret = delete_block_group_cache(fs_info, rc->block_group, - NULL, ref_objectid); - if (ret != -ENOENT) - return ret; - ret = 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - root = read_fs_root(fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out_free; - } - - key.objectid = ref_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - if (ref_offset > ((u64)-1 << 32)) - key.offset = 0; - else - key.offset = ref_offset; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted in - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (WARN_ON(ret > 0)) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (WARN_ON(key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY)) + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { + found = true; + space_cache_ino = key.objectid; break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - block->bytenr); } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - } -out: - btrfs_put_root(root); -out_free: - btrfs_free_path(path); - return err; + if (!found) + return -ENOENT; + ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, + space_cache_ino); + return ret; } /* @@ -3867,91 +3709,41 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize = 
rc->extent_root->fs_info->nodesize; + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct ulist *leaves = NULL; + struct ulist_iterator leaf_uiter; + struct ulist_node *ref_node = NULL; + const u32 blocksize = fs_info->nodesize; int ret = 0; - int err = 0; - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); - ptr += sizeof(struct btrfs_extent_item); - - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_get_extent_inline_ref_type(eb, iref, - BTRFS_REF_TYPE_DATA); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = -EUCLEAN; - btrfs_err(rc->extent_root->fs_info, - "extent %llu slot %d has an invalid inline ref type", - eb->start, path->slots[0]); - } - if (ret) { - err = ret; - goto out; - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); - - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) - break; - - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - ret = -EINVAL; - } else { - ret = 0; - } - if (ret) { - err = ret; - break; - } - path->slots[0]++; - } -out: btrfs_release_path(path); - if (err) + ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, + 0, &leaves, NULL, true); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&leaf_uiter); + while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + struct extent_buffer *eb; + + eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + break; + } + ret = delete_v1_space_cache(eb, rc->block_group, + extent_key->objectid); + free_extent_buffer(eb); + if (ret < 0) + break; + ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); + if (ret < 0) + break; + } + if (ret < 0) free_block_list(blocks); - return err; + ulist_free(leaves); + return ret; } /* From 8e19c9732ad1d127b5575a10f4fbcacf740500ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:23 -0500 Subject: [PATCH 183/210] btrfs: drop block from cache on error in relocation If we have an error while building the backref tree in relocation we'll process all the pending edges and then free the node. However if we integrated some edges into the cache we'll lose our link to those edges by simply freeing this node, which means we'll leak memory and references to any roots that we've found. Instead we need to use remove_backref_node(), which walks through all of the edges that are still linked to this node and free's them up and drops any root references we may be holding. 
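Generically, the difference between the two cleanup paths can be
sketched as follows (hypothetical types, not the backref cache
structures): freeing only the node leaks the edges still linked into
the cache and the references they pin, so teardown has to walk and
release them first.

  #include <linux/list.h>
  #include <linux/slab.h>

  struct node;
  static void put_node(struct node *n);	/* drops a counted reference */

  struct edge {
  	struct list_head link;
  	struct node *upper;	/* holds a counted reference */
  };

  struct node {
  	struct list_head upper_edges;
  };

  /* Counterpart of remove_backref_node(): unlink and release every
   * edge before freeing the node itself. */
  static void remove_node(struct node *node)
  {
  	struct edge *e, *tmp;

  	list_for_each_entry_safe(e, tmp, &node->upper_edges, link) {
  		list_del(&e->link);	/* unlink from the cache */
  		put_node(e->upper);	/* drop the pinned reference */
  		kfree(e);
  	}
  	kfree(node);
  }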
CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 536fc4cb8fcc..d7f3d2197fa1 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1228,7 +1228,7 @@ out: free_backref_node(cache, lower); } - free_backref_node(cache, node); + remove_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); From fb2d83eefef4e1c717205bac71cb1941edf8ae11 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:25 -0500 Subject: [PATCH 184/210] btrfs: unset reloc control if we fail to recover If we fail to load an fs root, or fail to start a transaction we can bail without unsetting the reloc control, which leads to problems later when we free the reloc control but still have it attached to the file system. In the normal path we'll end up calling unset_reloc_control() twice, but all it does is set fs_info->reloc_control = NULL, and we can only have one balance at a time so it's not racey. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d7f3d2197fa1..78537541059b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4460,9 +4460,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - unset_reloc_control(rc); err = PTR_ERR(trans); - goto out_free; + goto out_unset; } rc->merge_reloc_tree = 1; @@ -4482,7 +4481,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); - goto out_free; + goto out_unset; } err = __add_reloc_root(reloc_root); @@ -4493,7 +4492,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = btrfs_commit_transaction(trans); if (err) - goto out_free; + goto out_unset; merge_reloc_roots(rc); @@ -4509,7 +4508,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: +out_unset: + unset_reloc_control(rc); kfree(rc); out: if (!list_empty(&reloc_roots)) From 6217b0fadd4473a16fabc6aecd7527a9f71af534 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:27 -0500 Subject: [PATCH 185/210] btrfs: reloc: clean dirty subvols if we fail to start a transaction If we do merge_reloc_roots() we could insert a few roots onto the dirty subvol roots list, where we hold a ref on them. If we fail to start the transaction we need to run clean_dirty_subvols() in order to cleanup the refs. 
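The shape of the fix is the usual "clean up on every exit path"
pattern; a minimal sketch with hypothetical names (ctx, handle, and the
helpers are illustrative, not btrfs functions):

  #include <linux/err.h>

  struct ctx;
  struct handle;
  static void queue_dirty_items(struct ctx *c);	/* items hold references */
  static struct handle *start_transaction(struct ctx *c);
  static int commit_transaction(struct handle *h);
  static void drain_dirty_items(struct ctx *c);	/* drops those references */

  static int finish_merge(struct ctx *c)
  {
  	int err = 0;
  	struct handle *h;

  	queue_dirty_items(c);

  	h = start_transaction(c);
  	if (IS_ERR(h))
  		err = PTR_ERR(h);
  	else
  		err = commit_transaction(h);

  	/* Runs on success and failure alike, mirroring the moved
  	 * cleanup so a failed transaction start no longer leaks. */
  	drain_dirty_items(c);
  	return err;
  }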
CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 78537541059b..d2069d69fc92 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4082,10 +4082,10 @@ restart: goto out_free; } btrfs_commit_transaction(trans); +out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); return err; @@ -4501,10 +4501,10 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_free; + goto out_clean; } err = btrfs_commit_transaction(trans); - +out_clean: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; From 2abc726ab4b83db774e315c660ab8da21477092f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:24 -0500 Subject: [PATCH 186/210] btrfs: do not init a reloc root if we aren't relocating We previously were checking if the root had a dead root before accessing root->reloc_root in order to avoid a use-after-free type bug. However this scenario happens after we've unset the reloc control, so we would have been saved if we'd simply checked for fs_info->reloc_control. At this point during relocation we no longer need to be creating new reloc roots, so simply move this check above the reloc_root checks to avoid any future races and confusion. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d2069d69fc92..631f94085ba6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1511,6 +1511,10 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + /* * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree @@ -1524,10 +1528,6 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, return 0; } - if (!rc || !rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - if (!trans->reloc_reserved) { rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; @@ -2369,6 +2369,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans = NULL; goto out; } + + /* + * At this point we no longer have a reloc_control, so we can't + * depend on btrfs_init_reloc_root to update our last_trans. + * + * But that's ok, we started the trans handle on our + * corresponding fs_root, which means it's been added to the + * dirty list. At commit time we'll still call + * btrfs_update_reloc_root() and update our root item + * appropriately. + */ + reloc_root->last_trans = trans->transid; trans->block_rsv = rc->block_rsv; replaced = 0; From 1a0afa0ecfc4dbc8d7583d03cafd3f68f781df0c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:26 -0500 Subject: [PATCH 187/210] btrfs: free the reloc_control in a consistent way If we have an error while processing the reloc roots we could leak roots that were added to rc->reloc_roots before we hit the error. 
We could have also not removed the reloc tree mapping from our rb_tree, so clean up any remaining nodes in the reloc root rb_tree. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ use rbtree_postorder_for_each_entry_safe ] Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 631f94085ba6..a0c8f7949a19 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4202,6 +4202,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) return rc; } +static void free_reloc_control(struct reloc_control *rc) +{ + struct mapping_node *node, *tmp; + + free_reloc_roots(&rc->reloc_roots); + rbtree_postorder_for_each_entry_safe(node, tmp, + &rc->reloc_root_tree.rb_root, rb_node) + kfree(node); + + kfree(rc); +} + /* * Print the block group being relocated */ @@ -4346,7 +4358,7 @@ out: btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); - kfree(rc); + free_reloc_control(rc); return err; } @@ -4522,7 +4534,7 @@ out_clean: err = ret; out_unset: unset_reloc_control(rc); - kfree(rc); + free_reloc_control(rc); out: if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); From f28de8d8fd376ba57aaed53cdc1a0ba2238e8ebf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 11:44:46 -0400 Subject: [PATCH 188/210] btrfs: clear DEAD_RELOC_TREE before dropping the reloc root The DEAD_RELOC_TREE flag is in place in order to avoid a use after free in init_reloc_root, tracking the presence of reloc_root. However adding the explicit tree references in previous patches makes the use after free impossible because at this point we no longer have a reloc_control set on the fs_info and thus cannot enter the function. So move this to be coupled with clearing the root->reloc_root so we're consistent with all other operations of the reloc root. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a0c8f7949a19..54724ae45380 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2275,18 +2275,18 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; - if (reloc_root) { - - ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; - } /* * Need barrier to ensure clear_bit() only happens after * root->reloc_root = NULL. Pairs with have_reloc_root. */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + + if (reloc_root) { + ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + if (ret2 < 0 && !ret) + ret = ret2; + } btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ From f44deb7442edf42abee6be25fca7e3e86061b4c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 11:44:47 -0400 Subject: [PATCH 189/210] btrfs: hold a ref on the root->reloc_root We previously were relying on root->reloc_root to be cleaned up by the drop snapshot, or the error handling. However if btrfs_drop_snapshot() failed it wouldn't drop the ref for the root. Also we sort of depend on the right thing to happen with moving reloc roots between lists and the fs root they belong to, which makes it hard to figure out who owns the reference. 
Fix this by explicitly holding a reference on the reloc root for root->reloc_root. This means that we hold two references on reloc roots, one for whichever reloc_roots list it's attached to, and the root->reloc_root we're on. This makes it easier to reason about who owns a reference on the root, and when it needs to be dropped. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 58 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 54724ae45380..022e5af66fba 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,6 +1368,7 @@ static void __del_reloc_root(struct btrfs_root *root) struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; + bool put_ref = false; if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); @@ -1383,9 +1384,22 @@ static void __del_reloc_root(struct btrfs_root *root) BUG_ON((struct btrfs_root *)node->data != root); } + /* + * We only put the reloc root here if it's on the list. There's a lot + * of places where the pattern is to splice the rc->reloc_roots, process + * the reloc roots, and then add the reloc root back onto + * rc->reloc_roots. If we call __del_reloc_root while it's off of the + * list we don't want the reference being dropped, because the guy + * messing with the list is in charge of the reference. + */ spin_lock(&fs_info->trans_lock); - list_del_init(&root->root_list); + if (!list_empty(&root->root_list)) { + put_ref = true; + list_del_init(&root->root_list); + } spin_unlock(&fs_info->trans_lock); + if (put_ref) + btrfs_put_root(root); kfree(node); } @@ -1500,6 +1514,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* * create reloc tree for a given fs tree. reloc tree is just a * snapshot of the fs tree with special root objectid. + * + * The reloc_root comes out of here with two references, one for + * root->reloc_root, and another for being on the rc->reloc_roots list. */ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1539,7 +1556,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - root->reloc_root = reloc_root; + root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -1560,6 +1577,13 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; + /* + * We are probably ok here, but __del_reloc_root() will drop its ref of + * the root. We have the ref for root->reloc_root, but just in case + * hold it while we update the reloc root. + */ + btrfs_grab_root(reloc_root); + /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { @@ -1581,7 +1605,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); - + btrfs_put_root(reloc_root); out: return 0; } @@ -2281,18 +2305,28 @@ static int clean_dirty_subvols(struct reloc_control *rc) */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); - if (reloc_root) { + /* + * btrfs_drop_snapshot drops our ref we hold for + * ->reloc_root. If it fails however we must + * drop the ref ourselves.
+ */ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; + if (ret2 < 0) { + btrfs_put_root(reloc_root); + if (!ret) + ret = ret2; + } } btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ ret2 = btrfs_drop_snapshot(root, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; + if (ret2 < 0) { + btrfs_put_root(root); + if (!ret) + ret = ret2; + } } } return ret; @@ -4510,7 +4544,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ - fs_root->reloc_root = reloc_root; + fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } @@ -4698,6 +4732,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, /* * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot + * + * This is similar to btrfs_init_reloc_root(), we come out of here with two + * references held on the reloc_root, one for root->reloc_root and one for + * rc->reloc_roots. */ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) @@ -4730,7 +4768,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - new_root->reloc_root = reloc_root; + new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); From 7b7b74315b24dc064bc1c683659061c3d48f8668 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2020 11:18:30 -0500 Subject: [PATCH 190/210] btrfs: remove a BUG_ON() from merge_reloc_roots() This was pretty subtle: we default to reloc roots having 0 root refs, so if we crash in the middle of the relocation they can just be deleted. If we successfully complete the relocation operations we'll set our root refs to 1 in prepare_to_merge() and then go on to merge_reloc_roots(). At prepare_to_merge() time if any of the reloc roots have a 0 reference still, we will remove that reloc root from our reloc root rb tree, and then clean it up later. However this only happens if we successfully start a transaction. If we've aborted previously we will skip this step completely, and only have reloc roots with a reference count of 0, but they were never properly removed from the reloc control's rb tree. This isn't a problem per se; our references are held by the list the reloc roots are on, and by the original root the reloc root belongs to. If we end up in this situation all the reloc roots will be added to the dirty_reloc_list, and then properly dropped at that point. The reloc control will be freed and the rb tree is no longer used. There were two options when fixing this: one was to remove the BUG_ON(), the other was to make prepare_to_merge() handle the case where we couldn't start a trans handle. IMO this is the cleaner solution. I started with handling the error in prepare_to_merge(), but it turned out super ugly. And in the end this BUG_ON() simply doesn't matter, the cleanup was happening properly, we were just panicking because this BUG_ON() only matters in the success case. So I've opted to just remove it and add a comment where it was.
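For reference, the cleanup that makes the BUG_ON() safe to drop is free_reloc_control(), added earlier in this series, which mops up any mapping nodes still left in the rb tree:

        static void free_reloc_control(struct reloc_control *rc)
        {
                struct mapping_node *node, *tmp;

                free_reloc_roots(&rc->reloc_roots);
                rbtree_postorder_for_each_entry_safe(node, tmp,
                                &rc->reloc_root_tree.rb_root, rb_node)
                        kfree(node);

                kfree(rc);
        }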
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 022e5af66fba..2641b6c5c362 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2653,7 +2653,21 @@ out: free_reloc_roots(&reloc_roots); } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. + */ } static void free_block_list(struct rb_root *blocks) From 3fd6372758d91d8ba801e0733b17d082066a04ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:40 -0500 Subject: [PATCH 191/210] btrfs: make the extent buffer leak check per fs info I'm going to make the entire destruction of btrfs_roots controlled by their refcount, so it will be helpful to notice if we're leaking their extent buffers on umount. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 3 +++ fs/btrfs/extent_io.c | 45 ++++++++++++++++++++++---------------------- fs/btrfs/extent_io.h | 7 +++++++ 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ebca8e8365ce..f1eef81aaa49 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -949,6 +949,9 @@ struct btrfs_fs_info { struct kobject *debug_kobj; struct kobject *discard_debug_kobj; struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; #endif }; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b00ddea0b48..56cad6a51a7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1530,6 +1530,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); @@ -2656,6 +2657,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->unused_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); + INIT_LIST_HEAD(&fs_info->allocated_ebs); + spin_lock_init(&fs_info->eb_leak_lock); #endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 837262d54e28..36af71040974 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,42 +35,45 @@ static inline bool extent_state_in_tree(const struct extent_state *state) } #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(buffers); static LIST_HEAD(states); - static DEFINE_SPINLOCK(leak_lock); -static inline -void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +static inline void btrfs_leak_debug_add(spinlock_t *lock, + struct list_head *new, + struct list_head *head) { unsigned long flags; -
spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_add(new, head); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline -void btrfs_leak_debug_del(struct list_head *entry) +static inline void btrfs_leak_debug_del(spinlock_t *lock, + struct list_head *entry) { unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_del(entry); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline void btrfs_extent_buffer_leak_debug_check(void) +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) { struct extent_buffer *eb; + unsigned long flags; - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + while (!list_empty(&fs_info->allocated_ebs)) { + eb = list_first_entry(&fs_info->allocated_ebs, + struct extent_buffer, leak_list); pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } static inline void btrfs_extent_state_leak_debug_check(void) @@ -107,9 +110,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, } } #else -#define btrfs_leak_debug_add(new, head) do {} while (0) -#define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_extent_buffer_leak_debug_check() do {} while (0) +#define btrfs_leak_debug_add(lock, new, head) do {} while (0) +#define btrfs_leak_debug_del(lock, entry) do {} while (0) #define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif @@ -245,8 +247,6 @@ void __cold extent_state_cache_exit(void) void __cold extent_io_exit(void) { - btrfs_extent_buffer_leak_debug_check(); - /* * Make sure all delayed rcu free are flushed before we * destroy caches. 
@@ -324,7 +324,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&state->leak_list, &states); + btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -337,7 +337,7 @@ void free_extent_state(struct extent_state *state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&state->leak_list); + btrfs_leak_debug_del(&leak_lock, &state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -4875,7 +4875,7 @@ out_free_ulist: static void __free_extent_buffer(struct extent_buffer *eb) { - btrfs_leak_debug_del(&eb->leak_list); + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4962,7 +4962,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); - btrfs_leak_debug_add(&eb->leak_list, &buffers); + btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, + &fs_info->allocated_ebs); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 234622101230..2ed65bd0760e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); + +#ifdef CONFIG_BTRFS_DEBUG +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); +#else +#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) +#endif + #endif From 0e996e7fcf2e3a7a5d13b0ed5f9cebc551bd94ba Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:41 -0500 Subject: [PATCH 192/210] btrfs: move ino_cache_inode dropping out of btrfs_free_fs_root We are going to make root lifetime be controlled solely by refcounting, and inodes will be one of the things that hold a ref on the root. This means we need to handle dropping the ino_cache_inode outside of the root freeing logic, so move it into btrfs_drop_and_free_fs_root() so it is cleaned up properly on unmount.
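The moved teardown, as it appears in the diff below; the inode cache inode is now dropped when the root is torn down rather than when the structure is freed:

        if (root->ino_cache_inode) {
                iput(root->ino_cache_inode);
                root->ino_cache_inode = NULL;
        }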
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 ++++- fs/btrfs/transaction.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 56cad6a51a7d..37418d1d0820 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3912,12 +3912,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } btrfs_free_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) { - iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_dev) free_anon_bdev(root->anon_dev); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5939bca9d5eb..d2c667465fd4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2424,6 +2424,10 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) From 8c38938c7bb096313ad00c2bafa82af37636b0ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:42 -0500 Subject: [PATCH 193/210] btrfs: move the root freeing stuff into btrfs_put_root There are a few different ways to free roots: either you allocated them yourself and you just do free_extent_buffer(root->node); free_extent_buffer(root->commit_root); btrfs_put_root(root); which is the pattern for log roots. Or for snapshots/subvolumes that are being dropped you simply call btrfs_free_fs_root() which does all the cleanup for you. Unify this all into btrfs_put_root(), so that we don't free up things associated with the root until the last reference is dropped. This makes the root freeing code much more significant. The only caveat is at close_ctree() time we have to free the extent buffers for all of our main roots (extent_root, chunk_root, etc) because we have to drop the btree_inode and we'll run into issues if we hold onto those nodes until ->kill_sb() time. This will be addressed in the future when we kill the btree_inode.
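Condensed, the unified destructor has this shape (some teardown such as the anon_dev, the drew lock and the debug leak list is elided here; see the diff below for the full version):

        void btrfs_put_root(struct btrfs_root *root)
        {
                if (!root)
                        return;

                if (refcount_dec_and_test(&root->refs)) {
                        free_extent_buffer(root->node);
                        free_extent_buffer(root->commit_root);
                        kfree(root->free_ino_ctl);
                        kfree(root->free_ino_pinned);
                        kfree(root);
                }
        }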
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 63 ++++++++++++++++++------------------ fs/btrfs/disk-io.h | 16 +-------- fs/btrfs/extent-tree.c | 7 ++-- fs/btrfs/extent_io.c | 16 +++++++-- fs/btrfs/free-space-tree.c | 2 -- fs/btrfs/qgroup.c | 7 +--- fs/btrfs/relocation.c | 4 --- fs/btrfs/tests/btrfs-tests.c | 5 +-- fs/btrfs/tree-log.c | 6 ---- 9 files changed, 50 insertions(+), 76 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 37418d1d0820..242c072d69a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1255,11 +1255,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; fail: - if (leaf) { + if (leaf) btrfs_tree_unlock(leaf); - free_extent_buffer(root->commit_root); - free_extent_buffer(leaf); - } btrfs_put_root(root); return ERR_PTR(ret); @@ -1382,10 +1379,10 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); + root->node = NULL; goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - free_extent_buffer(root->node); goto find_fail; } root->commit_root = btrfs_root_node(root); @@ -1617,14 +1614,14 @@ again: if (ret) { btrfs_put_root(root); if (ret == -EEXIST) { - btrfs_free_fs_root(root); + btrfs_put_root(root); goto again; } goto fail; } return root; fail: - btrfs_free_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1996,11 +1993,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); + free_root_extent_buffers(info->fs_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } +void btrfs_put_root(struct btrfs_root *root) +{ + if (!root) + return; + + if (refcount_dec_and_test(&root->refs)) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + btrfs_drew_lock_destroy(&root->snapshot_lock); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif + kfree(root); + } +} + void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; @@ -2012,13 +2033,10 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); + else btrfs_put_root(gang[0]); - } } while (1) { @@ -2223,11 +2241,11 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); + log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); - free_extent_buffer(log_tree_root->node); btrfs_put_root(log_tree_root); return -EIO; } @@ -2236,7 +2254,6 @@ static int btrfs_replay_log(struct 
btrfs_fs_info *fs_info, if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); btrfs_put_root(log_tree_root); return ret; } @@ -3901,8 +3918,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { - free_extent_buffer(root->reloc_root->node); - free_extent_buffer(root->reloc_root->commit_root); btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } @@ -3916,19 +3931,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, iput(root->ino_cache_inode); root->ino_cache_inode = NULL; } - btrfs_free_fs_root(root); -} - -void btrfs_free_fs_root(struct btrfs_root *root) -{ - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_dev) - free_anon_bdev(root->anon_dev); - btrfs_drew_lock_destroy(&root->snapshot_lock); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); btrfs_put_root(root); } @@ -4093,8 +4095,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); - btrfs_free_fs_roots(fs_info); - btrfs_put_block_group_cache(fs_info); /* @@ -4106,6 +4106,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + btrfs_free_fs_roots(fs_info); /* * We must free the block groups after dropping the fs_roots as we could diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 59c885860bf8..cd629113f61c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,6 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); @@ -98,20 +97,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } -static inline void btrfs_put_root(struct btrfs_root *root) -{ - if (!root) - return; - if (refcount_dec_and_test(&root->refs)) { -#ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_radix_lock); - list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_radix_lock); -#endif - kfree(root); - } -} - +void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 001605b672b8..3e7bd2388ed2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5485,13 +5485,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } } - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); + else btrfs_put_root(root); - } root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 36af71040974..39e45b8a5031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,12 +64,21 @@ void 
btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; unsigned long flags; + /* + * If we didn't get into open_ctree our allocated_ebs will not be + * initialized, so just skip this. + */ + if (!fs_info->allocated_ebs.next) + return; + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); + pr_err( + "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_header_owner(eb)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4875,7 +4884,6 @@ out_free_ulist: static void __free_extent_buffer(struct extent_buffer *eb) { - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4941,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); __free_extent_buffer(eb); } @@ -5329,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index bc43950eb32f..8b1f5c8897b7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1251,8 +1251,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); - free_extent_buffer(free_space_root->node); - free_extent_buffer(free_space_root->commit_root); btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 75bc3b686498..c3888fb367e7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1038,11 +1038,8 @@ out_add_root: out_free_path: btrfs_free_path(path); out_free_root: - if (ret) { - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); + if (ret) btrfs_put_root(quota_root); - } out: if (ret) { ulist_free(fs_info->qgroup_ulist); @@ -1105,8 +1102,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); btrfs_put_root(quota_root); end_trans: diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2641b6c5c362..7f31dd57de54 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2579,10 +2579,6 @@ void free_reloc_roots(struct list_head *list) reloc_root = list_entry(list->next, struct btrfs_root, root_list); __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->node = NULL; - reloc_root->commit_root = NULL; } } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 69c9afef06e3..42e62fd2809c 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -194,6 
+194,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->super_copy); btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->fs_devices); kfree(fs_info); } @@ -205,10 +206,6 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) { - /* One for allocate_extent_buffer */ - free_extent_buffer(root->node); - } btrfs_put_root(root); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0eebd6a02d41..58c111474ba5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3299,7 +3299,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - free_extent_buffer(log->node); btrfs_put_root(log); } @@ -6246,8 +6245,6 @@ again: ret = btrfs_pin_extent_for_log_replay(trans, log->node->start, log->node->len); - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); btrfs_put_root(log); if (!ret) @@ -6285,8 +6282,6 @@ again: wc.replay_dest->log_root = NULL; btrfs_put_root(wc.replay_dest); - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); btrfs_put_root(log); if (ret) @@ -6318,7 +6313,6 @@ next: if (ret) return ret; - free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_put_root(log_root_tree); From 5c8fd99fec9d9265ed8fb732120a2ffd2bbab9b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:43 -0500 Subject: [PATCH 194/210] btrfs: make inodes hold a ref on their roots If we make sure all the inodes have refs on their root we don't have to worry about the root disappearing while we have open inodes. 
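The ref pairing this establishes, condensed from the diff below:

        /* Every place an inode is bound to a root takes a reference ... */
        BTRFS_I(inode)->root = btrfs_grab_root(root);

        /* ... and btrfs_destroy_inode() drops it when the inode goes away. */
        btrfs_put_root(BTRFS_I(inode)->root);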
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 242c072d69a7..5db76264b07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2096,7 +2096,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = fs_info->tree_root; + BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d31543350799..683bb20c1d0f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5235,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->location->objectid; memcpy(&BTRFS_I(inode)->location, args->location, sizeof(*args->location)); - BTRFS_I(inode)->root = args->root; + BTRFS_I(inode)->root = btrfs_grab_root(args->root); + BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; } @@ -5316,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -5884,7 +5885,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -8926,6 +8927,7 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); + btrfs_put_root(BTRFS_I(inode)->root); } int btrfs_drop_inode(struct inode *inode) From dc9492c14c758639d7b2468d4ed3c77e785c1a35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:44 -0500 Subject: [PATCH 195/210] btrfs: hold a ref on the root on the dead roots list At the point we add a root to the dead roots list we have no open inodes for that root, so we need to hold a ref on that root to keep it from disappearing. 
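The pairing, condensed from the diff below; the dead_roots list now owns a reference, taken when the root is queued and dropped once the snapshot has been cleaned:

        spin_lock(&fs_info->trans_lock);
        if (list_empty(&root->root_list)) {
                btrfs_grab_root(root);  /* the list owns this reference */
                list_add_tail(&root->root_list, &fs_info->dead_roots);
        }
        spin_unlock(&fs_info->trans_lock);

with the matching btrfs_put_root(root) in btrfs_clean_one_deleted_snapshot() after btrfs_drop_snapshot() finishes.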
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/transaction.c | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5db76264b07f..831755a867ce 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2035,8 +2035,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - else - btrfs_put_root(gang[0]); + btrfs_put_root(gang[0]); } while (1) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d2c667465fd4..8cede6eb9843 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1264,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); - if (list_empty(&root->root_list)) + if (list_empty(&root->root_list)) { + btrfs_grab_root(root); list_add_tail(&root->root_list, &fs_info->dead_roots); + } spin_unlock(&fs_info->trans_lock); } @@ -2435,6 +2437,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) else ret = btrfs_drop_snapshot(root, 1, 0); + btrfs_put_root(root); return (ret < 0) ? 0 : 1; } From 4785e24fa5d23b05bc0f579e10aeb44c4a0a2a3d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:45 -0500 Subject: [PATCH 196/210] btrfs: don't take an extra root ref at allocation time Now that all the users of roots take references for them we can drop the extra root ref we've been taking. Before we had roots at 2 refs for the life of the file system, one for the radix tree, and one simply for existing. Now that we have proper ref accounting in all places that use roots we can drop this extra ref simply for existing as we no longer need it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 20 ++++++-------------- fs/btrfs/tests/qgroup-tests.c | 2 ++ 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 831755a867ce..3a6d35e1edb7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1601,22 +1601,11 @@ again: if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - /* - * All roots have two refs on them at all times, one for the mounted fs, - * and one for being in the radix tree. This way we only free the root - * when we are unmounting or deleting the subvolume. We get one ref - * from __setup_root, one for inserting it into the radix tree, and then - * we have the third for returning it, and the caller will put it when - * it's done with the root. 
- */ - btrfs_grab_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { btrfs_put_root(root); - if (ret == -EEXIST) { - btrfs_put_root(root); + if (ret == -EEXIST) goto again; - } goto fail; } return root; @@ -3904,11 +3893,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + bool drop_ref = false; + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) - btrfs_put_root(root); + drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) @@ -3930,7 +3921,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, iput(root->ino_cache_inode); root->ino_cache_inode = NULL; } - btrfs_put_root(root); + if (drop_ref) + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ac035a6fa003..ce1ca8e73c2d 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { @@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); test_msg("running qgroup tests"); ret = test_no_shared_qgroup(root, sectorsize, nodesize); From efc3453494af78180ac00d9dd8391fd52c4a921e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:46 -0500 Subject: [PATCH 197/210] btrfs: make btrfs_cleanup_fs_roots use the radix tree lock The fs roots radix tree is primarily protected by the fs_roots_radix_lock, so use that to look up and get a ref on all of our fs roots in btrfs_cleanup_fs_roots. The tree reference is taken in the protected section as before. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a6d35e1edb7..4d3eba909664 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3932,15 +3932,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int i = 0; int err = 0; unsigned int ret = 0; - int index; while (1) { - index = srcu_read_lock(&fs_info->subvol_srcu); + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!ret) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); break; } root_objectid = gang[ret - 1]->root_key.objectid + 1; @@ -3954,7 +3953,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* grab all the search result for later use */ gang[i] = btrfs_grab_root(gang[i]); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); for (i = 0; i < ret; i++) { if (!gang[i]) From c75e839414d3610e6487ae3145199c500d55f7f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:47 -0500 Subject: [PATCH 198/210] btrfs: kill the subvol_srcu Now that we have proper root ref counting everywhere we can kill the subvol_srcu.
* removal of fs_info::subvol_srcu reduces size of fs_info by 1176 bytes * the refcount_t used for the references checks for accidental 0->1 in cases where the root lifetime would not be properly protected * there's a leak detector for roots to catch unfreed roots at umount time * SRCU served us well over the years but it was not a proper synchronization mechanism for some cases Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/backref.c | 12 +----------- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 37 +++++++++--------------------------- fs/btrfs/export.c | 21 ++++---------------- fs/btrfs/file.c | 5 ----- fs/btrfs/inode.c | 3 --- fs/btrfs/send.c | 14 -------------- fs/btrfs/tests/btrfs-tests.c | 9 --------- 8 files changed, 14 insertions(+), 88 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f2728fb3ee8f..cd2d39b60be0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -542,24 +542,19 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; - int index; struct btrfs_key search_key = ref->key_for_search; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - root = btrfs_get_fs_root(fs_info, &root_key, false); if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out_free; } if (btrfs_is_testing(fs_info)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; } @@ -571,10 +566,8 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, else root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root_level + 1 == level) goto out; - } /* * We can often find data backrefs with an offset that is too large @@ -604,9 +597,6 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, else ret = btrfs_search_old_slot(root, &search_key, path, time_seq); - /* root node has been locked, we can release @subvol_srcu safely here */ - srcu_read_unlock(&fs_info->subvol_srcu, index); - btrfs_debug(fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f1eef81aaa49..8aa7b9dac405 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -697,7 +697,6 @@ struct btrfs_fs_info { struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; - struct srcu_struct subvol_srcu; spinlock_t trans_lock; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4d3eba909664..a6cb5cbbdb9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2757,46 +2757,33 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = init_srcu_struct(&fs_info->subvol_srcu); + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); if (ret) return ret; - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); - if (ret) - goto fail; - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; ret =
percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); - if (!fs_info->delayed_root) { - ret = -ENOMEM; - goto fail; - } + if (!fs_info->delayed_root) + return -ENOMEM; btrfs_init_delayed_root(fs_info->delayed_root); - ret = btrfs_alloc_stripe_hash_table(fs_info); - if (ret) - goto fail; - - return 0; -fail: - cleanup_srcu_struct(&fs_info->subvol_srcu); - return ret; + return btrfs_alloc_stripe_hash_table(fs_info); } static int btrfs_uuid_rescan_kthread(void *data) @@ -2870,13 +2857,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { err = -ENOMEM; - goto fail_srcu; + goto fail; } fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_srcu; + goto fail; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); @@ -3398,8 +3385,6 @@ fail_alloc: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_srcu: - cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); return err; @@ -3902,9 +3887,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (btrfs_root_refs(&root->root_item) == 0) - synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { @@ -4116,7 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - cleanup_srcu_struct(&fs_info->subvol_srcu); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 25bd4317bf5a..2bb25d2dc44b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -65,8 +65,6 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, struct btrfs_root *root; struct inode *inode; struct btrfs_key key; - int index; - int err = 0; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -75,13 +73,9 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - root = btrfs_get_fs_root(fs_info, &key, true); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } + if (IS_ERR(root)) + return ERR_CAST(root); key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; @@ -89,12 +83,8 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, inode = btrfs_iget(sb, &key, root); btrfs_put_root(root); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto fail; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (check_generation && generation != inode->i_generation) { iput(inode); @@ -102,9 +92,6 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } return d_obtain_alias(inode); -fail: - srcu_read_unlock(&fs_info->subvol_srcu, index); - return ERR_PTR(err); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 18c88f514a0d..20107f42a766 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -278,7 +278,6 @@ static int 
__btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; - int index; int ret; /* get the inode */ @@ -286,8 +285,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - inode_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); @@ -303,7 +300,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode); goto cleanup; } - srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -339,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; cleanup: - srcu_read_unlock(&fs_info->subvol_srcu, index); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 683bb20c1d0f..320d1062068d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5364,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; u8 di_type = 0; - int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) @@ -5391,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); if (ret < 0) { @@ -5404,7 +5402,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } if (root != sub_root) btrfs_put_root(sub_root); - srcu_read_unlock(&fs_info->subvol_srcu, index); if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e47f768cec3d..c5f41bd86765 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7028,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; unsigned alloc_size; int sort_clone_roots = 0; - int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -7155,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - clone_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(clone_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -7168,7 +7164,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); btrfs_put_root(clone_root); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } @@ -7176,13 +7171,11 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); btrfs_put_root(clone_root); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; @@ -7196,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = 
(u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(sctx->parent_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } @@ -7210,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(sctx->parent_root) || btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } if (sctx->parent_root->dedupe_in_progress) { dedupe_in_progress_warn(sctx->parent_root); spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; } spin_unlock(&sctx->parent_root->root_item_lock); - - srcu_read_unlock(&fs_info->subvol_srcu, index); } /* diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 42e62fd2809c..999c14e5d0bd 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -134,14 +134,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; - - if (init_srcu_struct(&fs_info->subvol_srcu)) { - kfree(fs_info->fs_devices); - kfree(fs_info->super_copy); - kfree(fs_info); - return NULL; - } - set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -191,7 +183,6 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); - cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->super_copy); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); From 9c1036fdb1d1ff1b09d03c8db60f4dc67cc6614e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 13 Mar 2020 17:23:18 +0200 Subject: [PATCH 199/210] btrfs: Remove BTRFS_SUBVOL_CREATE_ASYNC support This functionality was deprecated in kernel 5.4. Since no one has complained of the impending removal it's time we did so. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ add comment ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 25 +------------------------ include/uapi/linux/btrfs.h | 13 ++++++++----- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f8a73a28022a..7e09985d566e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1818,8 +1818,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; - u64 transid = 0; - u64 *ptr = NULL; bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; @@ -1836,15 +1834,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - btrfs_warn(fs_info, -"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); - - ptr = &transid; - } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -1860,17 +1849,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, ptr, + vol_args->fd, subvol, NULL, readonly, inherit); if (ret) goto free_inherit; - - if (ptr && copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), - ptr, sizeof(*ptr))) - ret = -EFAULT; - free_inherit: kfree(inherit); free_args: @@ -1929,11 +1911,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out_drop_write; - } - if (flags & ~BTRFS_SUBVOL_RDONLY) { ret = -EOPNOTSUPP; goto out_drop_write; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b5f3ea36d3cb..8134924cfc17 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -36,7 +36,12 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_PATH_NAME_MAX 1024 #define BTRFS_SUBVOL_NAME_MAX 4039 -#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +/* + * Deprecated since 5.7: + * + * BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) + */ + #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) @@ -45,8 +50,7 @@ struct btrfs_ioctl_vol_args { #define BTRFS_SUBVOL_SPEC_BY_ID (1ULL << 4) #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ - (BTRFS_SUBVOL_CREATE_ASYNC | \ - BTRFS_SUBVOL_RDONLY | \ + (BTRFS_SUBVOL_RDONLY | \ BTRFS_SUBVOL_QGROUP_INHERIT | \ BTRFS_DEVICE_SPEC_BY_ID | \ BTRFS_SUBVOL_SPEC_BY_ID) @@ -116,8 +120,7 @@ struct btrfs_ioctl_qgroup_limit_args { /* Supported flags for BTRFS_IOC_SNAP_CREATE_V2 and BTRFS_IOC_SUBVOL_CREATE_V2 */ #define BTRFS_SUBVOL_CREATE_ARGS_MASK \ - (BTRFS_SUBVOL_CREATE_ASYNC | \ - BTRFS_SUBVOL_RDONLY | \ + (BTRFS_SUBVOL_RDONLY | \ BTRFS_SUBVOL_QGROUP_INHERIT) /* Supported flags for BTRFS_IOC_SNAP_DESTROY_V2 */ From 5d54c67eccb489b25e433bd98a5622e3a1195a8b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 13 Mar 2020 17:23:19 +0200 Subject: [PATCH 200/210] btrfs: Remove transid argument from btrfs_ioctl_snap_create_transid btrfs_ioctl_snap_create_transid no longer needs its transid argument, so remove it and rename the function to __btrfs_ioctl_snap_create to reflect that it's an internal worker function.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e09985d566e..e843785f966b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1729,9 +1729,9 @@ out: return ret; } -static noinline int btrfs_ioctl_snap_create_transid(struct file *file, +static noinline int __btrfs_ioctl_snap_create(struct file *file, const char *name, unsigned long fd, int subvol, - u64 *transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit *inherit) { int namelen; @@ -1758,7 +1758,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly, inherit); + NULL, readonly, inherit); } else { struct fd src = fdget(fd); struct inode *src_inode; @@ -1781,7 +1781,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid, readonly, inherit); + readonly, inherit); } fdput(src); } @@ -1805,9 +1805,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - NULL, false, NULL); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, false, NULL); kfree(vol_args); return ret; @@ -1848,9 +1847,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, NULL, - readonly, inherit); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, readonly, inherit); if (ret) goto free_inherit; free_inherit: From 9babda9f33fdb738b1aee770dcc130f25b67a6f0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 13 Mar 2020 17:23:20 +0200 Subject: [PATCH 201/210] btrfs: Remove async_transid from btrfs_mksubvol/create_subvol/create_snapshot With BTRFS_SUBVOL_CREATE_ASYNC support removed, it's no longer required to pass the async_transid parameter, so remove it and any code using it.
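Condensed from the hunks that follow, the commit path in create_subvol() and create_snapshot() collapses from a conditional async commit to a plain synchronous one:

	/* Before: optionally start an async commit and report its transid. */
	if (async_transid) {
		*async_transid = trans->transid;
		err = btrfs_commit_transaction_async(trans, 1);
		if (err)
			err = btrfs_commit_transaction(trans);
	} else {
		err = btrfs_commit_transaction(trans);
	}

	/* After: subvolume/snapshot creation always commits synchronously. */
	err = btrfs_commit_transaction(trans);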
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e843785f966b..40b729dce91c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -551,7 +551,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) static noinline int create_subvol(struct inode *dir, struct dentry *dentry, const char *name, int namelen, - u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -723,14 +722,7 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(fs_info, &block_rsv); - if (async_transid) { - *async_transid = trans->transid; - err = btrfs_commit_transaction_async(trans, 1); - if (err) - err = btrfs_commit_transaction(trans); - } else { - err = btrfs_commit_transaction(trans); - } + err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; @@ -748,8 +740,7 @@ fail_free: } static int create_snapshot(struct btrfs_root *root, struct inode *dir, - struct dentry *dentry, - u64 *async_transid, bool readonly, + struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -833,14 +824,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); spin_unlock(&fs_info->trans_lock); - if (async_transid) { - *async_transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 1); - if (ret) - ret = btrfs_commit_transaction(trans); - } else { - ret = btrfs_commit_transaction(trans); - } + + ret = btrfs_commit_transaction(trans); if (ret) goto fail; @@ -946,7 +931,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(const struct path *parent, const char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); @@ -982,13 +967,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; - if (snap_src) { - error = create_snapshot(snap_src, dir, dentry, - async_transid, readonly, inherit); - } else { - error = create_subvol(dir, dentry, name, namelen, - async_transid, inherit); - } + if (snap_src) + error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + else + error = create_subvol(dir, dentry, name, namelen, inherit); + if (!error) fsnotify_mkdir(dir, dentry); out_up_read: From cd22a51c6650f567f297215dd7c47c3401ee7290 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:09:53 -0400 Subject: [PATCH 202/210] btrfs: do not use readahead for running delayed refs Readahead will generate a lot of extra reads for adjacent nodes, but when running delayed refs we have no idea if the next ref is going to be adjacent or not, so this potentially just generates a lot of extra IO. To make matters worse, each ref is truly just looking for one item; it doesn't generally search forward, so we simply don't need readahead here.
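For contrast, here is a sketch of the access pattern where READA_FORWARD does pay off, namely a sequential walk over many adjacent leaves. This is illustrative only and not part of the patch; fs_info is assumed from the surrounding kernel context:

	struct btrfs_path *path;
	struct btrfs_key key = { .objectid = 0, .type = BTRFS_EXTENT_ITEM_KEY, .offset = 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/*
	 * Readahead pays off when many adjacent leaves will be visited in
	 * key order; a delayed ref looks up a single item, so prefetching
	 * neighbours there is wasted IO.
	 */
	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* (A real walk would also handle ret > 0 pointing past the last slot.) */
	while (1) {
		/* process path->nodes[0], slot path->slots[0] here */
		ret = btrfs_next_item(fs_info->extent_root, path);
		if (ret)	/* 1 means no more items, < 0 means error */
			break;
	}
out:
	btrfs_free_path(path);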
Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e7bd2388ed2..54a64d1e18c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1447,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, @@ -1472,7 +1471,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { @@ -1589,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { @@ -2978,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; From d7ff00f6082c8578a26d75f2ca1aa28b8f05901e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:09:54 -0400 Subject: [PATCH 203/210] btrfs: do not readahead in build_backref_tree Here we are just searching down to the bytenr we're building the backref tree for, and all of its paths to the roots. These bytenrs are not guaranteed to be anywhere near each other, so readahead just generates extra latency. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f31dd57de54..63b11e546a87 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -743,8 +743,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = READA_FORWARD; - path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { From 5f6b2e5cd67a7bb39eb24fa3206e54803e1a25ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:17:06 -0400 Subject: [PATCH 204/210] btrfs: reloc: reorder reservation before root selection Since we're not only checking for metadata reservations but also whether we need to throttle our delayed ref generation, reorder reserve_metadata_space() above the select_one_root() call in relocate_tree_block(). The reason we want this is because select_reloc_root() will mess with the backref cache, and if we're going to bail we want to be able to cleanly remove this node from the backref cache and come back along to regenerate it. Move it up so this is the first thing we do to make restarting cleaner.
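Condensed from the hunk that follows, the resulting order of operations in relocate_tree_block() becomes:

	/*
	 * Reserve first: on failure nothing has touched the backref cache
	 * yet, so the node can be dropped and regenerated cleanly.
	 */
	ret = reserve_metadata_space(trans, rc, node);
	if (ret)
		goto out;

	BUG_ON(node->processed);
	root = select_one_root(node);	/* only now is the backref cache consulted */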
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 63b11e546a87..73d9dc192fbb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3177,6 +3177,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, if (!node) return 0; + /* + * If we fail here we want to drop our backref_node because we are going + * to start over and regenerate the tree for it. + */ + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + BUG_ON(node->processed); root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { @@ -3184,12 +3192,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - ret = reserve_metadata_space(trans, rc, node); - if (ret) - goto out; - } - if (root) { if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); From 50dbbb71c79df89532ec41d118d59386e5a877e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:17:07 -0400 Subject: [PATCH 205/210] btrfs: restart relocate_tree_blocks properly There are two bugs here, but fixing them independently would just result in pain if you happened to bisect between the two patches. First is how we handle the -EAGAIN from relocate_tree_block(). We don't set error unless we happen to be the first node, which makes no sense; I have no idea what the code was trying to accomplish here. We in fact _do_ want err set here so that we know we need to restart in relocate_block_group(). Also we need finish_pending_nodes() to not actually call link_to_upper(), because we didn't actually relocate the block. And then if we do get -EAGAIN we do not want to set our backref cache last_trans to the one before ours. This would force us to update our backref cache if we didn't cross transaction ids, which would mean we'd have some nodes updated to their new_bytenr, but still able to find their old bytenr because we're searching the same commit root as the last time we went through relocate_tree_blocks. Fixing these two things keeps us from panicking when we start breaking out of relocate_tree_blocks() either for delayed ref flushing or enospc. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 73d9dc192fbb..b9abbbb1a72b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3269,9 +3269,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) - err = ret; - goto out; + err = ret; + break; } } out: @@ -4051,12 +4050,6 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { - /* - * if we fail to relocate tree blocks, force to update - * backref cache when committing transaction.
- */ - rc->backref_cache.last_trans = trans->transid - 1; - if (ret != -EAGAIN) { err = ret; break; } From ea287ab157c2816bf12aad4cece41372f9d146b4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:17:08 -0400 Subject: [PATCH 206/210] btrfs: track reloc roots based on their commit root bytenr We always search the commit root of the extent tree when looking up back references; however, we track the reloc roots based on their current bytenr. This is wrong: if we commit the transaction between relocating tree blocks we could end up in this code in build_backref_tree if (key.objectid == key.offset) { /* * Only root blocks of reloc trees use backref * pointing to itself. */ root = find_reloc_root(rc, cur->bytenr); ASSERT(root); cur->root = root; break; } find_reloc_root() is looking based on the bytenr we had in the commit root, but if we've COWed this reloc root we will not find that bytenr, and we will trip over the ASSERT(root). Fix this by using the commit_root->start bytenr for indexing the reloc roots. Then we change __update_reloc_root() to be called when we switch the commit root for the reloc root during commit. This fixes the panic I was seeing when we started throttling relocation for delayed refs. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b9abbbb1a72b..f65595602aa8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1339,7 +1339,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (!node) return -ENOMEM; - node->bytenr = root->node->start; + node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); @@ -1371,10 +1371,11 @@ static void __del_reloc_root(struct btrfs_root *root) if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) @@ -1405,7 +1406,7 @@ static void __del_reloc_root(struct btrfs_root *root) * helper to update the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1414,7 +1415,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1426,7 +1427,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = new_bytenr; + node->bytenr = root->node->start; rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); @@ -1595,6 +1596,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); btrfs_set_root_node(root_item,
reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -4660,11 +4662,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (buf == root->node) - __update_reloc_root(root, cow->start); - } - level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) From 39dba8739c4e360d7d1b27119c728791e68b0448 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2020 17:17:09 -0400 Subject: [PATCH 207/210] btrfs: do not resolve backrefs for roots that are being deleted Zygo reported a deadlock where a task was stuck in the inode logical resolve code. The deadlock looks like this: Task 1 btrfs_ioctl_logical_to_ino ->iterate_inodes_from_logical ->iterate_extent_inodes ->path->search_commit_root isn't set, so a transaction is started ->resolve_indirect_ref for a root that's being deleted ->search for our key, attempt to lock a node, DEADLOCK Task 2 btrfs_drop_snapshot ->walk down to a leaf, lock it, walk up, lock node ->end transaction ->start transaction -> wait_cur_trans Task 3 btrfs_commit_transaction ->wait_event(cur_trans->write_wait, num_writers == 1) DEADLOCK We are holding a transaction open in btrfs_ioctl_logical_to_ino while we try to resolve our references. btrfs_drop_snapshot() holds onto its locks while it stops and starts transaction handles, because it assumes nobody is going to touch the root now. Commit just does what commit does, waiting for the writers to finish, blocking any new trans handles from starting. Fix this by making the backref code not try to resolve backrefs of roots that are currently being deleted. This will keep us from walking into a snapshot that's currently being deleted. This problem was harder to hit before because we rarely broke out of the snapshot delete halfway through, but with my delayed ref throttling code it happened much more often. However, we've always been able to do this, so it's not a new problem. Fixes: 8da6d5815c59 ("Btrfs: added btrfs_find_all_roots()") Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index cd2d39b60be0..9c380e7edf62 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -554,6 +554,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out_free; } + if (!path->search_commit_root && + test_bit(BTRFS_ROOT_DELETING, &root->state)) { + ret = -ENOENT; + goto out; + } + if (btrfs_is_testing(fs_info)) { ret = -ENOENT; goto out; From abdd9feb45ed7e9cc55b60a58deba9ca78d46352 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 22 Mar 2020 10:09:11 +0100 Subject: [PATCH 208/210] btrfs: sysfs: Use scnprintf() instead of snprintf() snprintf() is a hard-to-use function, and it's especially difficult to use it properly for concatenating substrings in a buffer with a limited size. Since snprintf() returns the would-be-output size, not the actual size, the subsequent use of snprintf() may point to the incorrect position easily. Also, returning the value from snprintf() directly from a sysfs show function would pass a bogus value that is higher than the length of the actually truncated string. That said, although the current code doesn't actually overflow the buffer with PAGE_SIZE, it's a usage that shouldn't be done.
Or worse: it gives a false sense of confidence, as if the code were doing safe operations. This patch replaces such snprintf() calls with a safer version, scnprintf(). It returns the actual output size, hence it's more intuitive and the code does what's expected. Signed-off-by: Takashi Iwai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 57 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 93cf76118a04..a39bff64ff24 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "0\n"); + return scnprintf(buf, PAGE_SIZE, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", (i == 0 ? "" : " "), btrfs_super_csum_name(i)); } - ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%d\n", + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@
-405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } @@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } @@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } @@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static struct attribute *raid_attrs[] = { @@ -613,7 +613,7 @@ static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj); s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return snprintf(buf, PAGE_SIZE, "%lld\n", val); + return scnprintf(buf, PAGE_SIZE, "%lld\n", val); } SPACE_INFO_ATTR(flags); @@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = scnprintf(buf, PAGE_SIZE, label[0] ? 
"%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", + fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%pU\n", + return scnprintf(buf, PAGE_SIZE, "%pU\n", fs_info->fs_devices->metadata_uuid); } @@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return snprintf(buf, PAGE_SIZE, "%s (%s)\n", + return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(fs_info->csum_shash)); } @@ -992,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; - len += snprintf(str + len, bufsize - len, "%s%s", + len += scnprintf(str + len, bufsize - len, "%s%s", len ? 
"," : "", name); } @@ -1201,7 +1200,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); @@ -1214,7 +1213,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); @@ -1228,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1241,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); From 351cbf6e4410e7ece05e35d0a07320538f2418b4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Mar 2020 10:11:32 -0400 Subject: [PATCH 209/210] btrfs: use nofs allocations for running delayed items Zygo reported the following lockdep splat while testing the balance patches ====================================================== WARNING: possible circular locking dependency detected 5.6.0-c6f0579d496a+ #53 Not tainted ------------------------------------------------------ kswapd0/1133 is trying to acquire lock: ffff888092f622c0 (&delayed_node->mutex){+.+.}, at: __btrfs_release_delayed_node+0x7c/0x5b0 but task is already holding lock: ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}: fs_reclaim_acquire.part.91+0x29/0x30 fs_reclaim_acquire+0x19/0x20 kmem_cache_alloc_trace+0x32/0x740 add_block_entry+0x45/0x260 btrfs_ref_tree_mod+0x6e2/0x8b0 btrfs_alloc_tree_block+0x789/0x880 alloc_tree_block_no_bg_flush+0xc6/0xf0 __btrfs_cow_block+0x270/0x940 btrfs_cow_block+0x1ba/0x3a0 btrfs_search_slot+0x999/0x1030 btrfs_insert_empty_items+0x81/0xe0 btrfs_insert_delayed_items+0x128/0x7d0 __btrfs_run_delayed_items+0xf4/0x2a0 btrfs_run_delayed_items+0x13/0x20 btrfs_commit_transaction+0x5cc/0x1390 insert_balance_item.isra.39+0x6b2/0x6e0 btrfs_balance+0x72d/0x18d0 btrfs_ioctl_balance+0x3de/0x4c0 btrfs_ioctl+0x30ab/0x44a0 ksys_ioctl+0xa1/0xe0 __x64_sys_ioctl+0x43/0x50 do_syscall_64+0x77/0x2c0 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&delayed_node->mutex){+.+.}: __lock_acquire+0x197e/0x2550 lock_acquire+0x103/0x220 __mutex_lock+0x13d/0xce0 mutex_lock_nested+0x1b/0x20 __btrfs_release_delayed_node+0x7c/0x5b0 btrfs_remove_delayed_node+0x49/0x50 btrfs_evict_inode+0x6fc/0x900 evict+0x19a/0x2c0 dispose_list+0xa0/0xe0 prune_icache_sb+0xbd/0xf0 super_cache_scan+0x1b5/0x250 do_shrink_slab+0x1f6/0x530 shrink_slab+0x32e/0x410 shrink_node+0x2a5/0xba0 balance_pgdat+0x4bd/0x8a0 kswapd+0x35a/0x800 kthread+0x1e9/0x210 ret_from_fork+0x3a/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&delayed_node->mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/1133: #0: ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8fc380d8 (shrinker_rwsem){++++}, at: shrink_slab+0x1e8/0x410 #2: ffff8881e0e6c0e8 (&type->s_umount_key#42){++++}, at: trylock_super+0x1b/0x70 stack backtrace: CPU: 2 PID: 1133 Comm: kswapd0 Not tainted 5.6.0-c6f0579d496a+ #53 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xc1/0x11a print_circular_bug.isra.38.cold.57+0x145/0x14a check_noncircular+0x2a9/0x2f0 ? print_circular_bug.isra.38+0x130/0x130 ? stack_trace_consume_entry+0x90/0x90 ? save_trace+0x3cc/0x420 __lock_acquire+0x197e/0x2550 ? btrfs_inode_clear_file_extent_range+0x9b/0xb0 ? register_lock_class+0x960/0x960 lock_acquire+0x103/0x220 ? __btrfs_release_delayed_node+0x7c/0x5b0 __mutex_lock+0x13d/0xce0 ? __btrfs_release_delayed_node+0x7c/0x5b0 ? __asan_loadN+0xf/0x20 ? pvclock_clocksource_read+0xeb/0x190 ? __btrfs_release_delayed_node+0x7c/0x5b0 ? mutex_lock_io_nested+0xc20/0xc20 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1e6/0x2e0 mutex_lock_nested+0x1b/0x20 ? mutex_lock_nested+0x1b/0x20 __btrfs_release_delayed_node+0x7c/0x5b0 btrfs_remove_delayed_node+0x49/0x50 btrfs_evict_inode+0x6fc/0x900 ? btrfs_setattr+0x840/0x840 ? do_raw_spin_unlock+0xa8/0x140 evict+0x19a/0x2c0 dispose_list+0xa0/0xe0 prune_icache_sb+0xbd/0xf0 ? invalidate_inodes+0x310/0x310 super_cache_scan+0x1b5/0x250 do_shrink_slab+0x1f6/0x530 shrink_slab+0x32e/0x410 ? do_shrink_slab+0x530/0x530 ? do_shrink_slab+0x530/0x530 ? __kasan_check_read+0x11/0x20 ? mem_cgroup_protected+0x13d/0x260 shrink_node+0x2a5/0xba0 balance_pgdat+0x4bd/0x8a0 ? mem_cgroup_shrink_node+0x490/0x490 ? _raw_spin_unlock_irq+0x27/0x40 ? finish_task_switch+0xce/0x390 ? rcu_read_lock_bh_held+0xb0/0xb0 kswapd+0x35a/0x800 ? _raw_spin_unlock_irqrestore+0x4c/0x60 ? balance_pgdat+0x8a0/0x8a0 ? finish_wait+0x110/0x110 ? __kasan_check_read+0x11/0x20 ? __kthread_parkme+0xc6/0xe0 ? balance_pgdat+0x8a0/0x8a0 kthread+0x1e9/0x210 ? 
kthread_create_worker_on_cpu+0xc0/0xc0 ret_from_fork+0x3a/0x50 This is because we hold that delayed node's mutex while doing tree operations. Fix this by just wrapping the searches in nofs. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f678980fe564..bf1595a42a98 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" @@ -803,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; + unsigned int nofs_flag; char *ptr; int ret; + nofs_flag = memalloc_nofs_save(); ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, delayed_item->data_len); + memalloc_nofs_restore(nofs_flag); if (ret < 0 && ret != -EEXIST) return ret; @@ -935,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_delayed_item *curr, *prev; + unsigned int nofs_flag; int ret = 0; do_again: @@ -943,7 +948,9 @@ do_again: if (!curr) goto delete_fail; + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto delete_fail; else if (ret > 0) { @@ -1010,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + unsigned int nofs_flag; int mod; int ret; @@ -1022,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, else mod = 1; + nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); + memalloc_nofs_restore(nofs_flag); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1073,7 +1083,10 @@ search: key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; + + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto err_out; ASSERT(ret); From 6ff06729c22ec0b7498d900d79cc88cfb8aceaeb Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Tue, 17 Mar 2020 14:31:02 +0800 Subject: [PATCH 210/210] btrfs: fix missing semaphore unlock in btrfs_sync_file Ordered ops are started twice in btrfs_sync_file(), once outside of the inode mutex and once inside, after taking the dio semaphore. One error path was missing the semaphore unlock. Fixes: aab15e8ec2576 ("Btrfs: fix rare chances for data loss when doing a fast fsync") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Robbie Ko Reviewed-by: Filipe Manana [ add changelog ] Signed-off-by: David Sterba --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20107f42a766..8a144f9cb7ac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2117,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; }
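The rule the fix restores is that every early exit taken after down_write(&BTRFS_I(inode)->dio_sem) must release it before unlocking the inode. Condensed from the hunk above, with the surrounding context reconstructed from btrfs_sync_file() as an assumption rather than quoted verbatim:

	inode_lock(inode);
	...
	down_write(&BTRFS_I(inode)->dio_sem);
	...
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		/* the previously missing unlock */
		up_write(&BTRFS_I(inode)->dio_sem);
		inode_unlock(inode);
		goto out;
	}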