From abdad709ed8fe4fd3b865ed1010de37a49601ff4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 19 Mar 2022 18:04:41 -0600 Subject: [PATCH 01/19] io_uring: recycle provided before arming poll We currently have a race where we recycle the selected buffer if poll returns IO_APOLL_OK. But that's too late, as the poll could already be triggering or have triggered. If that race happens, then we're putting a buffer that's already being used. Fix this by recycling before we arm poll. This does mean that we'll sometimes almost instantly re-select the buffer, but it's rare enough in testing that it should not pose a performance issue. Fixes: b1c62645758e ("io_uring: recycle provided buffers if request goes async") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5fa736344b67..98949348ee02 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6240,6 +6240,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_kbuf_recycle(req); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; @@ -7491,7 +7493,6 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) io_queue_async_work(req, NULL); break; case IO_APOLL_OK: - io_kbuf_recycle(req); break; } From f63cf5192fe3418ad5ae1a4412eba5694b145f79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Mar 2022 13:08:38 -0600 Subject: [PATCH 02/19] io_uring: ensure that fsnotify is always called Ensure that we call fsnotify_modify() if we write a file, and that we do fsnotify_access() if we read it. This enables anyone using inotify on the file to get notified. Ditto for fallocate, ensure that fsnotify_modify() is called. 
Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 98949348ee02..1a65d7880440 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2973,8 +2973,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) + if (req->rw.kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { @@ -4537,6 +4541,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) req->sync.len); if (ret < 0) req_set_fail(req); + else + fsnotify_modify(req->file); io_req_complete(req, ret); return 0; } From 649bb75d19c93f5459f450191953dff4825fda3e Mon Sep 17 00:00:00 2001 From: Almog Khaikin Date: Mon, 21 Mar 2022 11:00:59 +0200 Subject: [PATCH 03/19] io_uring: fix memory ordering when SQPOLL thread goes to sleep Without a full memory barrier between the store to the flags and the load of the SQ tail the two operations can be reordered and this can lead to a situation where the SQPOLL thread goes to sleep while the application writes to the SQ tail and doesn't see the wakeup flag. This memory barrier pairs with a full memory barrier in the application between its store to the SQ tail and its load of the flags. 
Signed-off-by: Almog Khaikin Link: https://lore.kernel.org/r/20220321090059.46313-1-almogkh@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a65d7880440..48f4540d7dd5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8042,6 +8042,13 @@ static int io_sq_thread(void *data) needs_sched = false; break; } + + /* + * Ensure the store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb(); + if (io_sqring_entries(ctx)) { needs_sched = false; break; From 61bc84c4008812d784c398cfb54118c1ba396dfc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Mar 2022 19:03:24 -0600 Subject: [PATCH 04/19] io_uring: remove poll entry from list when canceling all When the ring is exiting, as part of the shutdown, poll requests are removed. But io_poll_remove_all() does not remove entries when finding them, and since completions are done out-of-band, we can find and remove the same entry multiple times. We do guard the poll execution by poll ownership, but that does not exclude us from reissuing a new one once the previous removal ownership goes away. This can race with poll execution as well, where we then end up seeing req->apoll be NULL because a previous task_work requeue finished the request. Remove the poll entry when we find it and get ownership of it. This prevents multiple invocations from finding it. 
Fixes: aa43477b0402 ("io_uring: poll rework") Reported-by: Dylan Yudaken Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 48f4540d7dd5..53bd71363a44 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6275,6 +6275,7 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } From e2c0cb7c0cc72939b61a7efee376206725796625 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Mar 2022 06:57:25 -0600 Subject: [PATCH 05/19] io_uring: bump poll refs to full 31-bits The previous commit: 61bc84c40088 ("io_uring: remove poll entry from list when canceling all") removed a potential overflow condition for the poll references. They are currently limited to 20-bits, even if we have 31-bits available. The upper bit is used to mark for cancelation. Bump the poll ref space to 31-bits, making that kind of situation much harder to trigger in general. We'll separately add overflow checking and handling. Fixes: aa43477b0402 ("io_uring: poll rework") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 53bd71363a44..e8d88f0cdad3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5793,7 +5793,7 @@ struct io_poll_table { }; #define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK ((1u << 20)-1) +#define IO_POLL_REF_MASK GENMASK(30, 0) /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. 
We can From d89a4fac0fbc6fe5fc24d1c9a889440dcf410368 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Mar 2022 13:11:28 -0600 Subject: [PATCH 06/19] io_uring: fix assuming triggered poll waitqueue is the single poll syzbot reports a recent regression: BUG: KASAN: use-after-free in __wake_up_common+0x637/0x650 kernel/sched/wait.c:101 Read of size 8 at addr ffff888011e8a130 by task syz-executor413/3618 CPU: 0 PID: 3618 Comm: syz-executor413 Tainted: G W 5.17.0-syzkaller-01402-g8565d64430f8 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x303 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 __wake_up_common+0x637/0x650 kernel/sched/wait.c:101 __wake_up_common_lock+0xd0/0x130 kernel/sched/wait.c:138 tty_release+0x657/0x1200 drivers/tty/tty_io.c:1781 __fput+0x286/0x9f0 fs/file_table.c:317 task_work_run+0xdd/0x1a0 kernel/task_work.c:164 exit_task_work include/linux/task_work.h:32 [inline] do_exit+0xaff/0x29d0 kernel/exit.c:806 do_group_exit+0xd2/0x2f0 kernel/exit.c:936 __do_sys_exit_group kernel/exit.c:947 [inline] __se_sys_exit_group kernel/exit.c:945 [inline] __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:945 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f439a1fac69 which is due to leaving the request on the waitqueue mistakenly. The reproducer is using a tty device, which means we end up arming the same poll queue twice (it uses the same poll waitqueue for both), but in io_poll_wake() we always just clear REQ_F_SINGLE_POLL regardless of which entry triggered. This leaves one waitqueue potentially armed after we're done, which then blows up in tty when the waitqueue is attempted removed. 
We have no room to store this information, so simply encode it in the wait_queue_entry->private where we store the io_kiocb request pointer. Fixes: 91eac1c69c20 ("io_uring: cache poll/double-poll state with a request flag") Reported-by: syzbot+09ad4050dd3a120bfccd@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e8d88f0cdad3..6395393eaf9e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6027,10 +6027,13 @@ static void io_poll_cancel_req(struct io_kiocb *req) io_poll_execute(req, 0, 0); } +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_kiocb *req = wait->private; + struct io_kiocb *req = wqe_to_req(wait); struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, wait); __poll_t mask = key_to_poll(key); @@ -6068,7 +6071,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; - req->flags &= ~REQ_F_SINGLE_POLL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask, poll->events); } @@ -6080,6 +6086,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct io_poll_iocb **poll_ptr) { struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling @@ -6105,6 +6112,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + /* mark as double wq entry */ + wqe_private |= 1; req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, 
first->events, first->wait.func); *poll_ptr = poll; @@ -6115,7 +6124,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; - poll->wait.private = req; + poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -6142,7 +6151,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - poll->wait.private = req; ipt->pt._key = mask; ipt->req = req; From 4d55f238f8b89124f73e50abbd05e413def514fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Mar 2022 14:12:33 -0600 Subject: [PATCH 07/19] io_uring: don't recycle provided buffer if punted to async worker We only really need to recycle the buffer when going async for a file type that has an indefinite response time (eg non-file/bdev). And for files that do arm poll, the async worker will arm poll anyway and the buffer will get recycled there. In that latter case, we're not holding ctx->uring_lock. Ensure we take the issue_flags into account and acquire it if we need to. 
Fixes: b1c62645758e ("io_uring: recycle provided buffers if request goes async") Reported-by: Stefan Roesch Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6395393eaf9e..f41d91ce1fd0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1383,7 +1383,7 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return NULL; } -static void io_kbuf_recycle(struct io_kiocb *req) +static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -1392,6 +1392,9 @@ static void io_kbuf_recycle(struct io_kiocb *req) if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return; + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); buf = req->kbuf; @@ -1399,6 +1402,9 @@ static void io_kbuf_recycle(struct io_kiocb *req) list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; req->kbuf = NULL; + + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -6254,7 +6260,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_kbuf_recycle(req); + io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) @@ -7504,7 +7510,6 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
*/ - io_kbuf_recycle(req); io_queue_async_work(req, NULL); break; case IO_APOLL_OK: From 7ba89d2af17aa879dda30f5d5d3f152e587fc551 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Mar 2022 09:32:35 -0600 Subject: [PATCH 08/19] io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly We currently don't attempt to get the full asked for length even if MSG_WAITALL is set, if we get a partial receive. If we do see a partial receive, then just note how many bytes we did and return -EAGAIN to get it retried. The iov is advanced appropriately for the vector based case, and we manually bump the buffer and remainder for the non-vector case. Cc: stable@vger.kernel.org Reported-by: Constantine Gavrilov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f41d91ce1fd0..a70de170aea1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -612,6 +612,7 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; + size_t done_io; }; struct io_open { @@ -5417,12 +5418,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; @@ -5465,6 +5475,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && 
(kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { req_set_fail(req); @@ -5474,6 +5488,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5524,12 +5542,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + return -EAGAIN; + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } From 8a3e8ee56417f5e0e66580d93941ed9d6f4c8274 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Mar 2022 09:30:05 -0600 Subject: [PATCH 09/19] io_uring: add flag for disabling provided buffer recycling If we need to continue doing this IO, then we don't want a potentially selected buffer recycled. Add a flag for that. Set this for recv/recvmsg if they do partial IO. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index a70de170aea1..88556e654c5a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -783,6 +783,7 @@ enum { REQ_F_SKIP_LINK_CQES_BIT, REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -845,6 +846,8 @@ enum { REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), }; struct async_poll { @@ -1392,6 +1395,9 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return; + /* don't recycle if we already did IO to this buffer */ + if (req->flags & REQ_F_PARTIAL_IO) + return; if (issue_flags & IO_URING_F_UNLOCKED) mutex_lock(&ctx->uring_lock); @@ -5477,6 +5483,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg); } req_set_fail(req); @@ -5546,6 +5553,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) sr->len -= ret; sr->buf += ret; sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; return -EAGAIN; } req_set_fail(req); From 7ef66d186eb95f987a97fb3329b65c840e2dc9bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Mar 2022 06:53:18 -0600 Subject: [PATCH 10/19] io_uring: remove IORING_CQE_F_MSG This was introduced with the message ring opcode, but isn't strictly required for the request itself. The sender can encode what is needed in user_data, which is passed to the receiver. 
It's unclear if having a separate flag that essentially says "This CQE did not originate from an SQE on this ring" provides any real utility to applications. While we can always re-introduce a flag to provide this information, we cannot take it away at a later point in time. Remove the flag while we still can, before it's in a released kernel. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- include/uapi/linux/io_uring.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 88556e654c5a..28b7a1b8abb6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4474,8 +4474,7 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) target_ctx = req->file->private_data; spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, - IORING_CQE_F_MSG); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); io_commit_cqring(target_ctx); spin_unlock(&target_ctx->completion_lock); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d2be4eb22008..784adc6f6ed2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -201,11 +201,9 @@ struct io_uring_cqe { * * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries - * IORING_CQE_F_MSG If set, CQE was generated with IORING_OP_MSG_RING */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) -#define IORING_CQE_F_MSG (1U << 2) enum { IORING_CQE_BUFFER_SHIFT = 16, From a73825ba70c93e1eb39a845bb3d9885a787f8ffe Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Mar 2022 07:34:35 -0700 Subject: [PATCH 11/19] io_uring: fix async accept on O_NONBLOCK sockets Do not set REQ_F_NOWAIT if the socket is non blocking. 
When enabled this causes the accept to immediately post a CQE with EAGAIN, which means you cannot perform an accept SQE on a NONBLOCK socket asynchronously. By removing the flag if there is no pending accept then poll is armed as usual and when a connection comes in the CQE is posted. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220324143435.2875844-1-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 28b7a1b8abb6..a76e91fe277c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5602,9 +5602,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; - if (req->file->f_flags & O_NONBLOCK) - req->flags |= REQ_F_NOWAIT; - if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) From 34d2bfe7d4b65b375d0edf704133a6b6970f9d81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Mar 2022 10:17:44 -0600 Subject: [PATCH 12/19] io_uring: improve task work cache utilization While profiling task_work intensive workloads, I noticed that most of the time in tctx_task_work() is spent stalled on loading 'req'. This is one of the unfortunate side effects of using linked lists, particularly when they end up being passed around. Prefetch the next request, if there is one. There's a sufficient amount of work in between that this makes it available for the next loop. While fiddling with the cache layout, move the link outside of the hot completion cacheline. It's rarely used in hot workloads, so better to bring in kbuf which is used for networked loads with provided buffers. This reduces tctx_task_work() overhead from ~3% to 1-1.5% in my testing. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a76e91fe277c..bb40c80fd9ca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -928,7 +928,6 @@ struct io_kiocb { struct io_wq_work_node comp_list; atomic_t refs; atomic_t poll_refs; - struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -939,6 +938,7 @@ struct io_kiocb { /* custom credentials, valid IFF REQ_F_CREDS is set */ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + struct io_kiocb *link; const struct cred *creds; struct io_wq_work work; }; @@ -2451,6 +2451,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { if (unlikely(!*uring_locked && *ctx)) ctx_commit_and_unlock(*ctx); @@ -2483,6 +2485,8 @@ static void handle_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { ctx_flush_and_put(*ctx, locked); *ctx = req->ctx; From 52dd86406dfa322c8d42b3a4328858abdc6f1d85 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Mar 2022 02:37:55 -0700 Subject: [PATCH 13/19] io_uring: enable EPOLLEXCLUSIVE for accept poll When polling sockets for accept, use EPOLLEXCLUSIVE. This is helpful when multiple accept SQEs are submitted. For O_NONBLOCK sockets multiple queued SQEs would previously have all completed at once, but most with -EAGAIN as the result. Now only one wakes up and completes. 
For sockets without O_NONBLOCK there is no user facing change, but internally the extra requests would previously be queued onto a worker thread as they would wake up with no connection waiting, and be punted. Now they do not wake up unnecessarily. Co-developed-by: Jens Axboe Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220325093755.4123343-1-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bb40c80fd9ca..e72f58e2d06d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -967,6 +967,7 @@ struct io_op_def { /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; + unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; /* do prep async if is going to be punted */ @@ -1061,6 +1062,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .poll_exclusive = 1, }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -6280,7 +6282,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) } else { mask |= POLLOUT | POLLWRNORM; } - + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; if (!(issue_flags & IO_URING_F_UNLOCKED) && !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, From 41cdcc2202d4c466534b8f38975d2e6b16317c0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 11:52:18 +0000 Subject: [PATCH 14/19] io_uring: improve req fields comments Move a misplaced comment about req->creds and add a line with assumptions about req->link. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e51d1e6b1f3708c2d4127b4e371f9daa4c5f859.1648209006.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e72f58e2d06d..0356b2642263 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -935,10 +935,11 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* custom credentials, valid IFF REQ_F_CREDS is set */ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; }; From ab0ac0959b028779ea43002db81daa12203cb57d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 13:00:42 +0000 Subject: [PATCH 15/19] io_uring: fix invalid flags for io_put_kbuf() io_req_complete_failed() doesn't require callers to hold ->uring_lock, use IO_URING_F_UNLOCKED version of io_put_kbuf(). The only affected place is the fail path of io_apoll_task_func(). Also add a lockdep annotation to catch such bugs in the future. 
Fixes: 3b2b78a8eb7cc ("io_uring: extend provided buf return to fails") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ccf602dbf8df3b6a8552a262d8ee0a13a086fbc7.1648212967.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0356b2642263..614321836cc3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1370,6 +1370,8 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { + lockdep_assert_held(&req->ctx->uring_lock); + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); } @@ -2165,7 +2167,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, 0)); + io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } static void io_req_complete_fail_submit(struct io_kiocb *req) From 8197b053a83335dd1b7eb7581a933924e25c1025 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 13:00:43 +0000 Subject: [PATCH 16/19] io_uring: fix put_kbuf without proper locking io_put_kbuf_comp() should only be called while holding ->completion_lock, however there is no such assumption in io_clean_op() and thus it can corrupt ->io_buffers_comp. Take the lock there, and work around the only user of io_clean_op() calling it with locks. Not the prettiest solution, but it's easier to refactor it for-next. 
Fixes: cc3cec8367cba ("io_uring: speedup provided buffer handling") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/743e2130b73ec6d48c4c5dd15db896c433431e6d.1648212967.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 614321836cc3..cc3a22d60fb4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1338,6 +1338,8 @@ static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { + lockdep_assert_held(&req->ctx->completion_lock); + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); @@ -2123,6 +2125,12 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } io_req_put_rsrc(req, ctx); + /* + * Selected buffer deallocation in io_clean_op() assumes that + * we don't hold ->completion_lock. Clean them here to avoid + * deadlocks. + */ + io_put_kbuf_comp(req); io_dismantle_req(req); io_put_task(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); @@ -7126,8 +7134,11 @@ fail: static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { From c86d18f4aa93e0e66cda0e55827cd03eea6bc5f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 16:36:31 +0000 Subject: [PATCH 17/19] io_uring: fix memory leak of uid in files registration When there are no files for __io_sqe_files_scm() to process in the range, it'll free everything and return. However, it forgets to put uid. 
Fixes: 08a451739a9b5 ("io_uring: allow sparse fixed file sets") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/accee442376f33ce8aaebb099d04967533efde92.1648226048.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index cc3a22d60fb4..39a9ff31dbc5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8845,6 +8845,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fput(fpl->fp[i]); } else { kfree_skb(skb); + free_uid(fpl->user); kfree(fpl); } From 9666d4206e9a14ff612e374b6b572b3efc797d46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Mar 2022 10:50:03 -0600 Subject: [PATCH 18/19] io_uring: fail links if msg-ring doesn't succeed We must always call req_set_fail() if the request is failed, otherwise we won't sever links for dependent chains correctly. Fixes: 4f57f06ce218 ("io_uring: add support for IORING_OP_MSG_RING command") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 39a9ff31dbc5..923410937dc7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4500,6 +4500,8 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } + if (ret < 0) + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } From 3f1d52abf098c85b177b8c6f5b310e8347d1bc42 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Mar 2022 10:43:56 -0600 Subject: [PATCH 19/19] io_uring: defer msg-ring file validity check until command issue In preparation for not using the file at prep time, defer checking if this file refers to a valid io_uring instance until issue time. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 923410937dc7..3d0dbcd2f69c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4473,9 +4473,6 @@ static int io_msg_ring_prep(struct io_kiocb *req, sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; - if (req->file->f_op != &io_uring_fops) - return -EBADFD; - req->msg.user_data = READ_ONCE(sqe->off); req->msg.len = READ_ONCE(sqe->len); return 0; @@ -4485,9 +4482,14 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx; struct io_msg *msg = &req->msg; - int ret = -EOVERFLOW; bool filled; + int ret; + ret = -EBADFD; + if (req->file->f_op != &io_uring_fops) + goto done; + + ret = -EOVERFLOW; target_ctx = req->file->private_data; spin_lock(&target_ctx->completion_lock); @@ -4500,6 +4502,7 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } +done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0);