io_uring-6.7-2023-11-30

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmVo5jYQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpuNNEAC7sm239X9ixFQ7E70kxz1FyPpF6KS5oVWT
 piAEbUkWQON5yI7M7C2l8w6/xDi8yCIf272HnvPlYfJzkoOfjt7hjWpxtKUOVYJe
 MtL+KSiXqtdv8fvYaS61dyPzqJQ7q7D1cnCVUv7eKm7hSo7NZQH94fzC7UW+Xm/G
 2wng8C5Ltd0IfLpqQJnrn/yGnsCw3PfQYiC7unMXB5NT5eriM5jnHGpl9EPMLxbP
 TWIyUYiqxzrd9QkCTdpEZkKP35Pho/tzCtc3mN0+9tcMuoESX0KnQiR5q5IPet4/
 kkTvZZ7Kw18k8Eb99JSH2G2maFrrEZg0C3MfTF0W4O2t19Pajx8cyhVyAa1ib32o
 TcT6+M1XdAp2rEpfDSRvNCqRpMXm1zARpo4GvEHqGbY5/VefXeJPPaJyAu0CLNlk
 p1FJCQq8hMHd71GCfzb9d1Z+Mozd7dOO1CJqPYz35WXdtXSJ0b8Hw/aVIaYT9JP7
 IbP9IE7ZuPPZq+BC6FTH1O2zbJ0h+PSC5yAONw+Py3YHUT586e11nCyhQUrOJQmE
 kJENcknQCtcFgckXzT5ROh+Vlt6KHjltrVOAT3Jl2LhRssczJo6/4+BZfgvHJipE
 TSOdKFS1Saxh0XX8DGovYT78rg3tullzkvWEVFRrDk6MlFOCHGAs1E0Prz7yqzE4
 KscqZOwIZw==
 =wA/u
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an issue with discontig page checking for IORING_SETUP_NO_MMAP

 - Fix an issue with not allowing IORING_SETUP_NO_MMAP also disallowing
   mmap'ed buffer rings

 - Fix an issue with deferred release of memory mapped pages

 - Fix a lockdep issue with IORING_SETUP_NO_MMAP

 - Use fget/fput consistently, even from our sync system calls. No real
   issue here, but if we were ever to allow closing io_uring descriptors
   it would be required. Let's play it safe and just use the full ref
   counted versions upfront. Most uses of io_uring are threaded anyway,
   and hence already doing the full version underneath.

* tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux:
  io_uring: use fget/fput consistently
  io_uring: free io_buffer_list entries via RCU
  io_uring/kbuf: prune deferred locked cache when tearing down
  io_uring/kbuf: recycle freed mapped buffer ring entries
  io_uring/kbuf: defer release of mapped buffer rings
  io_uring: enable io_mem_alloc/free to be used in other parts
  io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
  io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
This commit is contained in:
Linus Torvalds 2023-12-02 06:47:32 +09:00
commit c9a925b7bc
6 changed files with 224 additions and 70 deletions

View File

@ -340,6 +340,9 @@ struct io_ring_ctx {
struct list_head io_buffers_cache;
/* deferred free list, protected by ->uring_lock */
struct hlist_head io_buf_list;
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;

View File

@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
};
ktime_t timeout = KTIME_MAX;
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
struct file *file = NULL;
DEFINE_WAIT(wait);
int ret, i;
@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
f = fdget(sc.fd);
if (!f.file)
file = fget(sc.fd);
if (!file)
return -EBADF;
cd.file = f.file;
cd.file = file;
}
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
if (ret == -ENOENT || ret > 0)
ret = 0;
out:
fdput(f);
if (file)
fput(file);
return ret;
}

View File

@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
INIT_HLIST_HEAD(&ctx->io_buf_list);
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
static void io_mem_free(void *ptr)
void io_mem_free(void *ptr)
{
if (!ptr)
return;
@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
{
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
int ret, i;
*npages = 0;
@ -2718,27 +2720,29 @@ err:
io_pages_free(&page_array, ret > 0 ? ret : 0);
return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
}
/*
* Should be a single page. If the ring is small enough that we can
* use a normal page, that is fine. If we need multiple pages, then
* userspace should use a huge page. That's the only way to guarantee
* that we get contigious memory, outside of just being lucky or
* (currently) having low memory fragmentation.
*/
if (page_array[0] != page_array[ret - 1])
goto err;
/*
* Can't support mapping user allocated ring memory on 32-bit archs
* where it could potentially reside in highmem. Just fail those with
* -EINVAL, just like we did on kernels that didn't support this
* feature.
*/
page_addr = page_address(page_array[0]);
for (i = 0; i < nr_pages; i++) {
if (PageHighMem(page_array[i])) {
ret = -EINVAL;
ret = -EINVAL;
/*
* Can't support mapping user allocated ring memory on 32-bit
* archs where it could potentially reside in highmem. Just
* fail those with -EINVAL, just like we did on kernels that
* didn't support this feature.
*/
if (PageHighMem(page_array[i]))
goto err;
}
/*
* No support for discontig pages for now, should either be a
* single normal page, or a huge page. Later on we can add
* support for remapping discontig pages, for now we will
* just fail them with EINVAL.
*/
if (page_address(page_array[i]) != page_addr)
goto err;
page_addr += PAGE_SIZE;
}
*pages = page_array;
@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
}
}
static void *io_mem_alloc(size_t size)
void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
void *ret;
@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
struct page *page;
void *ptr;
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
mutex_lock(&ctx->uring_lock);
rcu_read_lock();
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
rcu_read_unlock();
if (!ptr)
return ERR_PTR(-EINVAL);
break;
@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
struct fd f;
struct file *file;
long ret;
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
f.flags = 0;
if (unlikely(!f.file))
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
f = fdget(fd);
if (unlikely(!f.file))
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (unlikely(!io_is_uring_fops(f.file)))
if (unlikely(!io_is_uring_fops(file)))
goto out;
}
ctx = f.file->private_data;
ctx = file->private_data;
ret = -EBADFD;
if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
goto out;
@ -3770,7 +3776,8 @@ iopoll_locked:
}
}
out:
fdput(f);
if (!(flags & IORING_ENTER_REGISTERED_RING))
fput(file);
return ret;
}
@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct fd f;
struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
f.flags = 0;
if (unlikely(!f.file))
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
f = fdget(fd);
if (unlikely(!f.file))
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (!io_is_uring_fops(f.file))
if (!io_is_uring_fops(file))
goto out_fput;
}
ctx = f.file->private_data;
ctx = file->private_data;
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
if (!use_registered_ring)
fput(file);
return ret;
}

View File

@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{

View File

@ -33,19 +33,42 @@ struct io_provide_buf {
__u16 bid;
};
struct io_buf_free {
struct hlist_node list;
void *mem;
size_t size;
int inuse;
};
static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl,
unsigned int bgid)
{
if (bl && bgid < BGID_ARRAY)
return &bl[bgid];
return xa_load(&ctx->io_bl_xa, bgid);
}
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
if (ctx->io_bl && bgid < BGID_ARRAY)
return &ctx->io_bl[bgid];
lockdep_assert_held(&ctx->uring_lock);
return xa_load(&ctx->io_bl_xa, bgid);
return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned int bgid)
{
/*
* Store buffer group ID and finally mark the list as visible.
* The normal lookup doesn't care about the visibility as we're
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid;
smp_store_release(&bl->is_ready, 1);
if (bgid < BGID_ARRAY)
return 0;
@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
int i;
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
GFP_KERNEL);
if (!ctx->io_bl)
bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
if (!bl)
return -ENOMEM;
for (i = 0; i < BGID_ARRAY; i++) {
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
ctx->io_bl[i].bgid = i;
INIT_LIST_HEAD(&bl[i].buf_list);
bl[i].bgid = i;
}
smp_store_release(&ctx->io_bl, bl);
return 0;
}
/*
* Mark the given mapped range as free for reuse
*/
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
struct io_buf_free *ibf;
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
if (bl->buf_ring == ibf->mem) {
ibf->inuse = 0;
return;
}
}
/* can't happen... */
WARN_ON_ONCE(1);
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
folio_put(virt_to_folio(bl->buf_ring));
/*
* io_kbuf_list_free() will free the page(s) at
* ->release() time.
*/
io_kbuf_mark_free(ctx, bl);
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U);
kfree(bl);
kfree_rcu(bl, rcu);
}
/*
* Move deferred locked entries to cache before pruning
*/
spin_lock(&ctx->completion_lock);
if (!list_empty(&ctx->io_buffers_comp))
list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
spin_unlock(&ctx->completion_lock);
list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
buf = list_entry(item, struct io_buffer, list);
kmem_cache_free(io_buf_cachep, buf);
@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
kfree(bl);
/*
* Doesn't need rcu free as it was never visible, but
* let's keep it consistent throughout. Also can't
* be a lower indexed array group, as adding one
* where lookup failed cannot happen.
*/
if (p->bgid >= BGID_ARRAY)
kfree_rcu(bl, rcu);
else
WARN_ON_ONCE(1);
goto err;
}
}
@ -531,19 +594,63 @@ error_unpin:
return -EINVAL;
}
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
/*
* See if we have a suitable region that we can reuse, rather than allocate
* both a new io_buf_free and mem region again. We leave it on the list as
* even a reused entry will need freeing at ring release.
*/
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
size_t ring_size)
{
struct io_buf_free *ibf, *best = NULL;
size_t best_dist;
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
size_t dist;
if (ibf->inuse || ibf->size < ring_size)
continue;
dist = ibf->size - ring_size;
if (!best || dist < best_dist) {
best = ibf;
if (!dist)
break;
best_dist = dist;
}
}
return best;
}
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
struct io_buf_free *ibf;
size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;
bl->buf_ring = ptr;
/* Reuse existing entry, if we can */
ibf = io_lookup_buf_free_entry(ctx, ring_size);
if (!ibf) {
ptr = io_mem_alloc(ring_size);
if (!ptr)
return -ENOMEM;
/* Allocate and store deferred free entry */
ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
if (!ibf) {
io_mem_free(ptr);
return -ENOMEM;
}
ibf->mem = ptr;
ibf->size = ring_size;
hlist_add_head(&ibf->list, &ctx->io_buf_list);
}
ibf->inuse = 1;
bl->buf_ring = ibf->mem;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_buffer_list *bl, *free_bl = NULL;
int ret;
lockdep_assert_held(&ctx->uring_lock);
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(&reg, bl);
ret = io_alloc_pbuf_ring(ctx, &reg, bl);
if (!ret) {
bl->nr_entries = reg.ring_entries;
@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
kfree(free_bl);
kfree_rcu(free_bl, rcu);
return ret;
}
@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
lockdep_assert_held(&ctx->uring_lock);
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
kfree_rcu(bl, rcu);
}
return 0;
}
@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
bl = io_buffer_get_list(ctx, bgid);
bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
/*
* Ensure the list is fully setup. Only strictly needed for RCU lookup
* via mmap, and in that case only for the array indexed groups. For
* the xarray lookups, it's either visible and ready, or not at all.
*/
if (!smp_load_acquire(&bl->is_ready))
return NULL;
if (!bl || !bl->is_mmap)
return NULL;
return bl->buf_ring;
}
/*
* Called at or after ->release(), free the mmap'ed buffers that we used
* for memory mapped provided buffer rings.
*/
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{
struct io_buf_free *ibf;
struct hlist_node *tmp;
hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
hlist_del(&ibf->list);
io_mem_free(ibf->mem);
kfree(ibf);
}
}

View File

@ -15,6 +15,7 @@ struct io_buffer_list {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
struct rcu_head rcu;
};
__u16 bgid;
@ -28,6 +29,8 @@ struct io_buffer_list {
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
/* bl is visible from an RCU point of view for lookup */
__u8 is_ready;
};
struct io_buffer {
@ -51,6 +54,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);