linux/fs/btrfs/root-tree.c
Boris Burkov 74e9795812 btrfs: qgroup: fix qgroup prealloc rsv leak in subvolume operations
Create subvolume, create snapshot and delete subvolume all use
btrfs_subvolume_reserve_metadata() to reserve metadata for the changes
done to the parent subvolume's fs tree, which cannot be mediated in the
normal way via start_transaction. When quota groups (squota or qgroups)
are enabled, this reserves qgroup metadata of type PREALLOC. Once the
operation is associated to a transaction, we convert PREALLOC to
PERTRANS, which gets cleared in bulk at the end of the transaction.

However, the error paths of these three operations were not implementing
this lifecycle correctly. They unconditionally converted the PREALLOC to
PERTRANS in a generic cleanup step regardless of errors or whether the
operation was fully associated to a transaction or not. This resulted in
error paths occasionally converting this rsv to PERTRANS without calling
record_root_in_trans successfully, which meant that unless that root got
recorded in the transaction by some other thread, the end of the
transaction would not free that root's PERTRANS, leaking it. Ultimately,
this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount
for the leaked reservation.

The fix is to ensure that every qgroup PREALLOC reservation observes the
following properties:

1. any failure before record_root_in_trans is called successfully
   results in freeing the PREALLOC reservation.
2. after record_root_in_trans, we convert to PERTRANS, and now the
   transaction owns freeing the reservation.

This patch enforces those properties on the three operations. Without
it, generic/269 with squotas enabled at mkfs time would fail in ~5-10
runs on my system. With this patch, it ran successfully 1000 times in a
row.

Fixes: e85fde5162 ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations")
CC: stable@vger.kernel.org # 6.1+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Signed-off-by: David Sterba <dsterba@suse.com>
2024-04-02 19:18:23 +02:00

551 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
#include "root-tree.h"
#include "orphan.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
* sizeof(root_item), we know it's an old version of the root structure and
* initialize all new fields to zero. The same happens if we detect mismatching
* generation numbers as then we know the root was once mounted with an older
* kernel that was not aware of the root item structure change.
*/
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
u32 len;
int need_reset = 0;
len = btrfs_item_size(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
btrfs_warn(eb->fs_info,
"mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
}
need_reset = 1;
}
if (need_reset) {
/* Clear all members from generation_v2 onwards. */
memset_startat(item, 0, generation_v2);
generate_random_guid(item->uuid);
}
}
/*
* Lookup the root by the key.
*
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
* root_item: the root item of the tree we look for
* root_key: the root key of the tree we look for
*
* If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
* of the search key, just lookup the root with the highest offset for a
* given objectid.
*
* If we find something return 0, otherwise > 0, < 0 on error.
*/
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key)
{
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
return ret;
if (search_key->offset != -1ULL) { /* the search key is exact */
if (ret > 0)
goto out;
} else {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of the valid range.
*/
if (ret == 0) {
ret = -EUCLEAN;
goto out;
}
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
ret = 0;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
if (root_item)
btrfs_read_root_item(l, slot, root_item);
if (root_key)
memcpy(root_key, &found_key, sizeof(found_key));
out:
btrfs_release_path(path);
return ret;
}
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node)
{
btrfs_set_root_bytenr(item, node->start);
btrfs_set_root_level(item, btrfs_header_level(node));
btrfs_set_root_generation(item, btrfs_header_generation(node));
}
/*
* copy the data in 'item' into the btree
*/
int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
int ret;
int slot;
unsigned long ptr;
u32 old_len;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
key->objectid, key->type, key->offset,
root->root_key.objectid);
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
old_len = btrfs_item_size(l, slot);
/*
* If this is the first time we update the root item which originated
* from an older kernel, we need to enlarge the item size to make room
* for the added fields.
*/
if (old_len < sizeof(*item)) {
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
}
/*
* Update generation_v2 so at the next mount we know the new root
* fields are valid.
*/
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_root_item *item)
{
/*
* Make sure generation v1 and v2 match. See update_root for details.
*/
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
while (1) {
u64 root_objectid;
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
err = ret;
break;
}
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(tree_root, path);
if (ret < 0)
err = ret;
if (ret != 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
root_objectid = key.offset;
key.offset++;
root = btrfs_get_fs_root(fs_info, root_objectid, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
struct btrfs_trans_handle *trans;
btrfs_release_path(path);
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
btrfs_handle_fs_error(fs_info, err,
"Failed to start trans to delete orphan item");
break;
}
err = btrfs_del_orphan_item(trans, tree_root,
root_objectid);
btrfs_end_transaction(trans);
if (err) {
btrfs_handle_fs_error(fs_info, err,
"Failed to delete root orphan item");
break;
}
continue;
}
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
struct btrfs_key drop_key;
btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
/*
* If we have a non-zero drop_progress then we know we
* made it partly through deleting this snapshot, and
* thus we need to make sure we block any balance from
* happening until this snapshot is completely dropped.
*/
if (drop_key.objectid != 0 || drop_key.type != 0 ||
drop_key.offset != 0) {
set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
}
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
btrfs_put_root(root);
}
btrfs_free_path(path);
return err;
}
/* drop the root item for 'key' from the tree root */
int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
struct btrfs_path *path;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
if (ret != 0) {
/* The root must exist but we did not find it by the key. */
ret = -EUCLEAN;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 *sequence,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
goto out;
} else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
ret = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
goto out;
} else {
ret = -ENOENT;
goto out;
}
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
out:
btrfs_free_path(path);
return ret;
}
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
*
* The dirid, sequence, name and name_len refer to the directory entry
* that is referencing the root.
*
* For a forward ref, the root_id is the id of the tree referencing
* the root and ref_id is the id of the subvol or snapshot.
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it.
*
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name->len);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
return ret;
}
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
btrfs_set_root_ref_dirid(leaf, ref, dirid);
btrfs_set_root_ref_sequence(leaf, ref, sequence);
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
btrfs_free_path(path);
return 0;
}
/*
* Old btrfs forgets to init root_item->flags and root_item->byte_limit
* for subvolumes. To work around this problem, we steal a bit from
* root_item->inode_item->flags, and use it to indicate if those fields
* have been properly initialized.
*/
void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
{
u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
btrfs_set_root_flags(root_item, 0);
btrfs_set_root_limit(root_item, 0);
}
}
void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
struct timespec64 ct;
ktime_get_real_ts64(&ct);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
spin_unlock(&root->root_item_lock);
}
/*
* Reserve space for subvolume operation.
*
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
* use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations are different with the
* common file/directory operations, they change two fs/file trees
* and root tree, the number of items that the qgroup reserves is
* different with the free space reservation. So we can not use
* the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv, int items,
bool use_global_rsv)
{
u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
if (btrfs_qgroup_enabled(fs_info)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
qgroup_num_bytes, true,
false);
if (ret)
return ret;
}
num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
if (!ret) {
spin_lock(&rsv->lock);
rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&rsv->lock);
}
return ret;
}