btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes
refcounts have a generic implementation and an asm optimized one. The
generic version has extra debugging to make sure that once a refcount
goes to zero, refcount_inc won't increase it.
The btrfs delayed inode code wasn't expecting this, and we're tripping
over the warnings when the generic refcounts are used. We ended up with
this race:
Process A                                     Process B
                                              btrfs_get_delayed_node()
                                              spin_lock(root->inode_lock)
                                              radix_tree_lookup()
__btrfs_release_delayed_node()
refcount_dec_and_test(&delayed_node->refs)
our refcount is now zero
                                              refcount_add(2) <---
                                              warning here, refcount
                                              unchanged

spin_lock(root->inode_lock)
radix_tree_delete()
With the generic refcounts, we actually warn again when process B above
tries to release his refcount because refcount_add() turned into a
no-op.
We saw this in production on older kernels without the asm optimized
refcounts.
The fix used here is to use refcount_inc_not_zero() to detect when the
object is in the middle of being freed and return NULL. This is almost
always the right answer anyway, since we usually end up pitching the
delayed_node if it didn't have fresh data in it.
This also changes __btrfs_release_delayed_node() to remove the extra
check for zero refcounts before radix tree deletion.
btrfs_get_delayed_node() was the only path that was allowing refcounts
to go from zero to one.
Fixes: 6de5f18e7b ("btrfs: fix refcount_t usage when deleting btrfs_delayed_node")
CC: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
beed9263f4
commit
ec35e48b28
@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
|
|||||||
|
|
||||||
spin_lock(&root->inode_lock);
|
spin_lock(&root->inode_lock);
|
||||||
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
|
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
|
||||||
|
|
||||||
if (node) {
|
if (node) {
|
||||||
if (btrfs_inode->delayed_node) {
|
if (btrfs_inode->delayed_node) {
|
||||||
refcount_inc(&node->refs); /* can be accessed */
|
refcount_inc(&node->refs); /* can be accessed */
|
||||||
@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
|
|||||||
spin_unlock(&root->inode_lock);
|
spin_unlock(&root->inode_lock);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It's possible that we're racing into the middle of removing
|
||||||
|
* this node from the radix tree. In this case, the refcount
|
||||||
|
* was zero and it should never go back to one. Just return
|
||||||
|
* NULL like it was never in the radix at all; our release
|
||||||
|
* function is in the process of removing it.
|
||||||
|
*
|
||||||
|
* Some implementations of refcount_inc refuse to bump the
|
||||||
|
* refcount once it has hit zero. If we don't do this dance
|
||||||
|
* here, refcount_inc() may decide to just WARN_ONCE() instead
|
||||||
|
* of actually bumping the refcount.
|
||||||
|
*
|
||||||
|
* If this node is properly in the radix, we want to bump the
|
||||||
|
* refcount twice, once for the inode and once for this get
|
||||||
|
* operation.
|
||||||
|
*/
|
||||||
|
if (refcount_inc_not_zero(&node->refs)) {
|
||||||
|
refcount_inc(&node->refs);
|
||||||
btrfs_inode->delayed_node = node;
|
btrfs_inode->delayed_node = node;
|
||||||
/* can be accessed and cached in the inode */
|
} else {
|
||||||
refcount_add(2, &node->refs);
|
node = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
spin_unlock(&root->inode_lock);
|
spin_unlock(&root->inode_lock);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
@ -254,16 +276,17 @@ static void __btrfs_release_delayed_node(
|
|||||||
mutex_unlock(&delayed_node->mutex);
|
mutex_unlock(&delayed_node->mutex);
|
||||||
|
|
||||||
if (refcount_dec_and_test(&delayed_node->refs)) {
|
if (refcount_dec_and_test(&delayed_node->refs)) {
|
||||||
bool free = false;
|
|
||||||
struct btrfs_root *root = delayed_node->root;
|
struct btrfs_root *root = delayed_node->root;
|
||||||
|
|
||||||
spin_lock(&root->inode_lock);
|
spin_lock(&root->inode_lock);
|
||||||
if (refcount_read(&delayed_node->refs) == 0) {
|
/*
|
||||||
|
* Once our refcount goes to zero, nobody is allowed to bump it
|
||||||
|
* back up. We can delete it now.
|
||||||
|
*/
|
||||||
|
ASSERT(refcount_read(&delayed_node->refs) == 0);
|
||||||
radix_tree_delete(&root->delayed_nodes_tree,
|
radix_tree_delete(&root->delayed_nodes_tree,
|
||||||
delayed_node->inode_id);
|
delayed_node->inode_id);
|
||||||
free = true;
|
|
||||||
}
|
|
||||||
spin_unlock(&root->inode_lock);
|
spin_unlock(&root->inode_lock);
|
||||||
if (free)
|
|
||||||
kmem_cache_free(delayed_node_cache, delayed_node);
|
kmem_cache_free(delayed_node_cache, delayed_node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user