mirror of
https://github.com/torvalds/linux.git
synced 2024-11-11 14:42:24 +00:00
btrfs: always wait on ordered extents at fsync time
There's a priority inversion that exists currently with btrfs fsync. In some cases we will collect outstanding ordered extents onto a list and only wait on them at the very last second. However this "very last second" falls inside of a transaction handle, so if we are in a lower priority cgroup we can end up holding the transaction open for longer than needed, so if a high priority cgroup is also trying to fsync() it'll see latency. Signed-off-by: Josef Bacik <jbacik@fb.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
16d1c062c7
commit
b5e6c3e170
@ -2068,53 +2068,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
|||||||
atomic_inc(&root->log_batch);
|
atomic_inc(&root->log_batch);
|
||||||
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
||||||
&BTRFS_I(inode)->runtime_flags);
|
&BTRFS_I(inode)->runtime_flags);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We might have have had more pages made dirty after calling
|
* We have to do this here to avoid the priority inversion of waiting on
|
||||||
* start_ordered_ops and before acquiring the inode's i_mutex.
|
* IO of a lower priority task while holding a transaciton open.
|
||||||
*/
|
*/
|
||||||
if (full_sync) {
|
ret = btrfs_wait_ordered_range(inode, start, len);
|
||||||
/*
|
|
||||||
* For a full sync, we need to make sure any ordered operations
|
|
||||||
* start and finish before we start logging the inode, so that
|
|
||||||
* all extents are persisted and the respective file extent
|
|
||||||
* items are in the fs/subvol btree.
|
|
||||||
*/
|
|
||||||
ret = btrfs_wait_ordered_range(inode, start, len);
|
|
||||||
} else {
|
|
||||||
/*
|
|
||||||
* Start any new ordered operations before starting to log the
|
|
||||||
* inode. We will wait for them to finish in btrfs_sync_log().
|
|
||||||
*
|
|
||||||
* Right before acquiring the inode's mutex, we might have new
|
|
||||||
* writes dirtying pages, which won't immediately start the
|
|
||||||
* respective ordered operations - that is done through the
|
|
||||||
* fill_delalloc callbacks invoked from the writepage and
|
|
||||||
* writepages address space operations. So make sure we start
|
|
||||||
* all ordered operations before starting to log our inode. Not
|
|
||||||
* doing this means that while logging the inode, writeback
|
|
||||||
* could start and invoke writepage/writepages, which would call
|
|
||||||
* the fill_delalloc callbacks (cow_file_range,
|
|
||||||
* submit_compressed_extents). These callbacks add first an
|
|
||||||
* extent map to the modified list of extents and then create
|
|
||||||
* the respective ordered operation, which means in
|
|
||||||
* tree-log.c:btrfs_log_inode() we might capture all existing
|
|
||||||
* ordered operations (with btrfs_get_logged_extents()) before
|
|
||||||
* the fill_delalloc callback adds its ordered operation, and by
|
|
||||||
* the time we visit the modified list of extent maps (with
|
|
||||||
* btrfs_log_changed_extents()), we see and process the extent
|
|
||||||
* map they created. We then use the extent map to construct a
|
|
||||||
* file extent item for logging without waiting for the
|
|
||||||
* respective ordered operation to finish - this file extent
|
|
||||||
* item points to a disk location that might not have yet been
|
|
||||||
* written to, containing random data - so after a crash a log
|
|
||||||
* replay will make our inode have file extent items that point
|
|
||||||
* to disk locations containing invalid data, as we returned
|
|
||||||
* success to userspace without waiting for the respective
|
|
||||||
* ordered operation to finish, because it wasn't captured by
|
|
||||||
* btrfs_get_logged_extents().
|
|
||||||
*/
|
|
||||||
ret = start_ordered_ops(inode, start, end);
|
|
||||||
}
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
inode_unlock(inode);
|
inode_unlock(inode);
|
||||||
goto out;
|
goto out;
|
||||||
@ -2239,13 +2198,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!full_sync) {
|
|
||||||
ret = btrfs_wait_ordered_range(inode, start, len);
|
|
||||||
if (ret) {
|
|
||||||
btrfs_end_transaction(trans);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ret = btrfs_commit_transaction(trans);
|
ret = btrfs_commit_transaction(trans);
|
||||||
} else {
|
} else {
|
||||||
ret = btrfs_end_transaction(trans);
|
ret = btrfs_end_transaction(trans);
|
||||||
|
Loading…
Reference in New Issue
Block a user