diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 042eab793e26..a275f91d2ac0 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -440,11 +440,21 @@ struct io_uring_cqe { * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct * them from sends. + * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get + * more completions. In other words, the buffer is being + * partially consumed, and will be used by the kernel for + * more completions. This is only set for buffers used via + * the incremental buffer consumption, as provided by + * a ring buffer setup with IOU_PBUF_RING_INC. For any + * other provided buffer type, all completions with a + * buffer passed back is automatically returned to the + * application. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) +#define IORING_CQE_F_BUF_MORE (1U << 4) #define IORING_CQE_BUFFER_SHIFT 16 @@ -716,9 +726,17 @@ struct io_uring_buf_ring { * mmap(2) with the offset set as: * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * to get a virtual mapping for the ring. + * IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be + * consumed incrementally. Normally one (or more) buffers + * are fully consumed. With incremental consumptions, it's + * feasible to register big ranges of buffers, and each + * use of it will consume only as much as it needs. This + * requires that both the kernel and application keep + * track of where the current read/recv index is at. */ enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_MMAP = 1, + IOU_PBUF_RING_INC = 2, }; /* argument for IORING_(UN)REGISTER_PBUF_RING */ diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 55d01861d8c5..1f503bcc9c9f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -212,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); - size_t needed; if (unlikely(!len)) return -ENOBUFS; - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; + /* + * Limit incremental buffers to 1 segment. No point trying + * to peek ahead and map more than we need, when the buffers + * themselves should be large when setup with + * IOU_PBUF_RING_INC. + */ + if (bl->flags & IOBL_INC) { + nr_avail = 1; + } else { + size_t needed; + + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } } /* @@ -244,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->buf_index = buf->bid; do { - /* truncate end piece, if needed */ - if (buf->len > arg->max_len) - buf->len = arg->max_len; + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } iov->iov_base = u64_to_user_ptr(buf->addr); - iov->iov_len = buf->len; + iov->iov_len = len; iov++; - arg->out_len += buf->len; - arg->max_len -= buf->len; + arg->out_len += len; + arg->max_len -= len; if (!arg->max_len) break; @@ -675,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!reg.ring_addr) @@ -713,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!ret) { bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; io_buffer_add_list(ctx, bl, reg.bgid); return 0; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b41e2a0a0505..36aadfe5ac00 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -9,6 +9,9 @@ enum { IOBL_BUF_RING = 1, /* ring mapped provided buffers, but mmap'ed by application */ IOBL_MMAP = 2, + /* buffers are consumed incrementally rather than always fully */ + IOBL_INC = 4, + }; struct io_buffer_list { @@ -124,24 +127,45 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) /* Mapped buffer ring, return io_uring_buf from head */ #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] -static inline void io_kbuf_commit(struct io_kiocb *req, +static inline bool io_kbuf_commit(struct io_kiocb *req, struct io_buffer_list *bl, int len, int nr) { if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) - return; - bl->head += nr; + return true; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + + if (bl->flags & IOBL_INC) { + struct io_uring_buf *buf; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + if (WARN_ON_ONCE(len > buf->len)) + len = buf->len; + buf->len -= len; + if (buf->len) { + buf->addr += len; + return false; + } + } + + bl->head += nr; + return true; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) { struct io_buffer_list *bl = req->buf_list; + bool ret = true; if (bl) { - io_kbuf_commit(req, bl, len, nr); + ret = io_kbuf_commit(req, bl, len, nr); req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; + return ret; } static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, @@ -176,10 +200,12 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, return 0; ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req, len, nbufs); - else + if (req->flags & REQ_F_BUFFER_RING) { + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + } else { __io_put_kbuf(req, len, issue_flags); + } return ret; }