mirror of
https://github.com/torvalds/linux.git
synced 2024-11-17 01:22:07 +00:00
for-5.16-deadlock-fix-tag
-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmGL/QcACgkQxWXV+ddt WDvj2g//aYLMLB53Px0rEvQgC8YJKtBFjPTL0FVI9YUQoolIykLpvtn5KQlyJRGK dnuORSSfM3azmEbAz6KlBm/kMHDCq3yGhvdVvGxoG8ndKhB/5JIUB+m1GCEF2OEY fMQJhsjPSOmxWkJzPCeHeMMXTWn1h7dWEZXTChfZZVvU8C0+UwZ0uVmdRPF5QdpI Dg/knwaGIGWfs6f5e8Lwg6+M+fLjJkLDeZEqIRqWF+EmYT0rTyv0vU/cKsUqBint KlsauJBOu/gQNSsXS1+GmIj4u4DqhocyB8z5ZOaDrq3MRP/jkJ0vDCsHXW97/AWU gPPz3N2hhUMtVG2ORbQsLGHdUdjgNwVYuMpbhx8m/XJ1dpclpgbOYkiVu/jfelUU YbFIR3NObu3hW3XOdpSRo+rkQafMtRgV4WWPbPC4gU+a/r0KnF9Q5L4jlHtIHbys UXyzNsxrWl4dX7QsvwLJ+VCAM6dF5M/3UPCJzZtYRPhtyZGNPf+WLSuMPxDqUQMY YuVTu5EuJpk707fFfhWf+6GI77Bo1Trc+a4d5G9Es/jCjobrKQRvS+aq6V8H9Vk4 0AQFRedMuTbPVmpM7LA8/B+aGBkwoOQ7Uh02CgT6hzOO0gLsFV/zGL+y23908G7S b9g3Ef8UY1k9wYFl5HpP/ZQwZiPLTZvTkbZhuaFx8+4/sAReXHw= =dM9v -----END PGP SIGNATURE----- Merge tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fix from David Sterba: "Fix for a deadlock when direct/buffered IO is done on a mmaped file and a fault happens (details in the patch). There's a fstest generic/647 that triggers the problem and makes testing hard" * tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: fix deadlock due to page faults during direct IO reads and writes
This commit is contained in:
commit
6070dcc8e5
139
fs/btrfs/file.c
139
fs/btrfs/file.c
@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
|
||||
|
||||
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
loff_t pos;
|
||||
ssize_t written = 0;
|
||||
ssize_t written_buffered;
|
||||
size_t prev_left = 0;
|
||||
loff_t endbyte;
|
||||
ssize_t err;
|
||||
unsigned int ilock_flags = 0;
|
||||
struct iomap_dio *dio = NULL;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
ilock_flags |= BTRFS_ILOCK_TRY;
|
||||
@ -1964,23 +1965,80 @@ relock:
|
||||
goto buffered;
|
||||
}
|
||||
|
||||
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
0, 0);
|
||||
/*
|
||||
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
|
||||
* calls generic_write_sync() (through iomap_dio_complete()), because
|
||||
* that results in calling fsync (btrfs_sync_file()) which will try to
|
||||
* lock the inode in exclusive/write mode.
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags &= ~IOCB_DSYNC;
|
||||
|
||||
/*
|
||||
* The iov_iter can be mapped to the same file range we are writing to.
|
||||
* If that's the case, then we will deadlock in the iomap code, because
|
||||
* it first calls our callback btrfs_dio_iomap_begin(), which will create
|
||||
* an ordered extent, and after that it will fault in the pages that the
|
||||
* iov_iter refers to. During the fault in we end up in the readahead
|
||||
* pages code (starting at btrfs_readahead()), which will lock the range,
|
||||
* find that ordered extent and then wait for it to complete (at
|
||||
* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
|
||||
* obviously the ordered extent can never complete as we didn't submit
|
||||
* yet the respective bio(s). This always happens when the buffer is
|
||||
* memory mapped to the same file range, since the iomap DIO code always
|
||||
* invalidates pages in the target file range (after starting and waiting
|
||||
* for any writeback).
|
||||
*
|
||||
* So here we disable page faults in the iov_iter and then retry if we
|
||||
* got -EFAULT, faulting in the pages before the retry.
|
||||
*/
|
||||
again:
|
||||
from->nofault = true;
|
||||
err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
IOMAP_DIO_PARTIAL, written);
|
||||
from->nofault = false;
|
||||
|
||||
/* No increment (+=) because iomap returns a cumulative value. */
|
||||
if (err > 0)
|
||||
written = err;
|
||||
|
||||
if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
|
||||
const size_t left = iov_iter_count(from);
|
||||
/*
|
||||
* We have more data left to write. Try to fault in as many as
|
||||
* possible of the remainder pages and retry. We do this without
|
||||
* releasing and locking again the inode, to prevent races with
|
||||
* truncate.
|
||||
*
|
||||
* Also, in case the iov refers to pages in the file range of the
|
||||
* file we want to write to (due to a mmap), we could enter an
|
||||
* infinite loop if we retry after faulting the pages in, since
|
||||
* iomap will invalidate any pages in the range early on, before
|
||||
* it tries to fault in the pages of the iov. So we keep track of
|
||||
* how much was left of iov in the previous EFAULT and fallback
|
||||
* to buffered IO in case we haven't made any progress.
|
||||
*/
|
||||
if (left == prev_left) {
|
||||
err = -ENOTBLK;
|
||||
} else {
|
||||
fault_in_iov_iter_readable(from, left);
|
||||
prev_left = left;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
|
||||
btrfs_inode_unlock(inode, ilock_flags);
|
||||
|
||||
if (IS_ERR_OR_NULL(dio)) {
|
||||
err = PTR_ERR_OR_ZERO(dio);
|
||||
if (err < 0 && err != -ENOTBLK)
|
||||
goto out;
|
||||
} else {
|
||||
written = iomap_dio_complete(dio);
|
||||
}
|
||||
/*
|
||||
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
|
||||
* the fsync (call generic_write_sync()).
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags |= IOCB_DSYNC;
|
||||
|
||||
if (written < 0 || !iov_iter_count(from)) {
|
||||
err = written;
|
||||
/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
|
||||
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
|
||||
goto out;
|
||||
}
|
||||
|
||||
buffered:
|
||||
pos = iocb->ki_pos;
|
||||
@ -2005,7 +2063,7 @@ buffered:
|
||||
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
|
||||
endbyte >> PAGE_SHIFT);
|
||||
out:
|
||||
return written ? written : err;
|
||||
return err < 0 ? err : written;
|
||||
}
|
||||
|
||||
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
|
||||
@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
|
||||
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
size_t prev_left = 0;
|
||||
ssize_t read = 0;
|
||||
ssize_t ret;
|
||||
|
||||
if (fsverity_active(inode))
|
||||
@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
|
||||
return 0;
|
||||
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
|
||||
again:
|
||||
/*
|
||||
* This is similar to what we do for direct IO writes, see the comment
|
||||
* at btrfs_direct_write(), but we also disable page faults in addition
|
||||
* to disabling them only at the iov_iter level. This is because when
|
||||
* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
|
||||
* which can still trigger page fault ins despite having set ->nofault
|
||||
* to true of our 'to' iov_iter.
|
||||
*
|
||||
* The difference to direct IO writes is that we deadlock when trying
|
||||
* to lock the extent range in the inode's tree during he page reads
|
||||
* triggered by the fault in (while for writes it is due to waiting for
|
||||
* our own ordered extent). This is because for direct IO reads,
|
||||
* btrfs_dio_iomap_begin() returns with the extent range locked, which
|
||||
* is only unlocked in the endio callback (end_bio_extent_readpage()).
|
||||
*/
|
||||
pagefault_disable();
|
||||
to->nofault = true;
|
||||
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
0, 0);
|
||||
IOMAP_DIO_PARTIAL, read);
|
||||
to->nofault = false;
|
||||
pagefault_enable();
|
||||
|
||||
/* No increment (+=) because iomap returns a cumulative value. */
|
||||
if (ret > 0)
|
||||
read = ret;
|
||||
|
||||
if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
|
||||
const size_t left = iov_iter_count(to);
|
||||
|
||||
if (left == prev_left) {
|
||||
/*
|
||||
* We didn't make any progress since the last attempt,
|
||||
* fallback to a buffered read for the remainder of the
|
||||
* range. This is just to avoid any possibility of looping
|
||||
* for too long.
|
||||
*/
|
||||
ret = read;
|
||||
} else {
|
||||
/*
|
||||
* We made some progress since the last retry or this is
|
||||
* the first time we are retrying. Fault in as many pages
|
||||
* as possible and retry.
|
||||
*/
|
||||
fault_in_iov_iter_writeable(to, left);
|
||||
prev_left = left;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
return ret;
|
||||
return ret < 0 ? ret : read;
|
||||
}
|
||||
|
||||
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
|
Loading…
Reference in New Issue
Block a user