mirror of
https://github.com/torvalds/linux.git
synced 2024-11-11 06:31:49 +00:00
io_uring-6.7-2023-11-30
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmVo5jYQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpuNNEAC7sm239X9ixFQ7E70kxz1FyPpF6KS5oVWT
piAEbUkWQON5yI7M7C2l8w6/xDi8yCIf272HnvPlYfJzkoOfjt7hjWpxtKUOVYJe
MtL+KSiXqtdv8fvYaS61dyPzqJQ7q7D1cnCVUv7eKm7hSo7NZQH94fzC7UW+Xm/G
2wng8C5Ltd0IfLpqQJnrn/yGnsCw3PfQYiC7unMXB5NT5eriM5jnHGpl9EPMLxbP
TWIyUYiqxzrd9QkCTdpEZkKP35Pho/tzCtc3mN0+9tcMuoESX0KnQiR5q5IPet4/
kkTvZZ7Kw18k8Eb99JSH2G2maFrrEZg0C3MfTF0W4O2t19Pajx8cyhVyAa1ib32o
TcT6+M1XdAp2rEpfDSRvNCqRpMXm1zARpo4GvEHqGbY5/VefXeJPPaJyAu0CLNlk
p1FJCQq8hMHd71GCfzb9d1Z+Mozd7dOO1CJqPYz35WXdtXSJ0b8Hw/aVIaYT9JP7
IbP9IE7ZuPPZq+BC6FTH1O2zbJ0h+PSC5yAONw+Py3YHUT586e11nCyhQUrOJQmE
kJENcknQCtcFgckXzT5ROh+Vlt6KHjltrVOAT3Jl2LhRssczJo6/4+BZfgvHJipE
TSOdKFS1Saxh0XX8DGovYT78rg3tullzkvWEVFRrDk6MlFOCHGAs1E0Prz7yqzE4
KscqZOwIZw==
=wA/u
-----END PGP SIGNATURE-----

Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an issue with discontig page checking for IORING_SETUP_NO_MMAP

 - Fix an issue with not allowing IORING_SETUP_NO_MMAP also disallowing
   mmap'ed buffer rings

 - Fix an issue with deferred release of memory mapped pages

 - Fix a lockdep issue with IORING_SETUP_NO_MMAP

 - Use fget/fput consistently, even from our sync system calls. No real
   issue here, but if we were ever to allow closing io_uring descriptors
   it would be required. Let's play it safe and just use the full ref
   counted versions upfront. Most uses of io_uring are threaded anyway,
   and hence already doing the full version underneath.
* tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux:
  io_uring: use fget/fput consistently
  io_uring: free io_buffer_list entries via RCU
  io_uring/kbuf: prune deferred locked cache when tearing down
  io_uring/kbuf: recycle freed mapped buffer ring entries
  io_uring/kbuf: defer release of mapped buffer rings
  io_uring: enable io_mem_alloc/free to be used in other parts
  io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
  io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
This commit is contained in:
commit
c9a925b7bc
@ -340,6 +340,9 @@ struct io_ring_ctx {
|
||||
|
||||
struct list_head io_buffers_cache;
|
||||
|
||||
/* deferred free list, protected by ->uring_lock */
|
||||
struct hlist_head io_buf_list;
|
||||
|
||||
/* Keep this last, we don't need it for the fast path */
|
||||
struct wait_queue_head poll_wq;
|
||||
struct io_restriction restrictions;
|
||||
|
@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
|
||||
};
|
||||
ktime_t timeout = KTIME_MAX;
|
||||
struct io_uring_sync_cancel_reg sc;
|
||||
struct fd f = { };
|
||||
struct file *file = NULL;
|
||||
DEFINE_WAIT(wait);
|
||||
int ret, i;
|
||||
|
||||
@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
|
||||
/* we can grab a normal file descriptor upfront */
|
||||
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
|
||||
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
|
||||
f = fdget(sc.fd);
|
||||
if (!f.file)
|
||||
file = fget(sc.fd);
|
||||
if (!file)
|
||||
return -EBADF;
|
||||
cd.file = f.file;
|
||||
cd.file = file;
|
||||
}
|
||||
|
||||
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
|
||||
@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (ret == -ENOENT || ret > 0)
|
||||
ret = 0;
|
||||
out:
|
||||
fdput(f);
|
||||
if (file)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
||||
INIT_LIST_HEAD(&ctx->sqd_list);
|
||||
INIT_LIST_HEAD(&ctx->cq_overflow_list);
|
||||
INIT_LIST_HEAD(&ctx->io_buffers_cache);
|
||||
INIT_HLIST_HEAD(&ctx->io_buf_list);
|
||||
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
|
||||
sizeof(struct io_rsrc_node));
|
||||
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
|
||||
@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
|
||||
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
|
||||
}
|
||||
|
||||
static void io_mem_free(void *ptr)
|
||||
void io_mem_free(void *ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
return;
|
||||
@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
|
||||
{
|
||||
struct page **page_array;
|
||||
unsigned int nr_pages;
|
||||
void *page_addr;
|
||||
int ret, i;
|
||||
|
||||
*npages = 0;
|
||||
@ -2718,27 +2720,29 @@ err:
|
||||
io_pages_free(&page_array, ret > 0 ? ret : 0);
|
||||
return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
|
||||
}
|
||||
|
||||
page_addr = page_address(page_array[0]);
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
ret = -EINVAL;
|
||||
|
||||
/*
|
||||
* Should be a single page. If the ring is small enough that we can
|
||||
* use a normal page, that is fine. If we need multiple pages, then
|
||||
* userspace should use a huge page. That's the only way to guarantee
|
||||
* that we get contiguous memory, outside of just being lucky or
|
||||
* (currently) having low memory fragmentation.
|
||||
* Can't support mapping user allocated ring memory on 32-bit
|
||||
* archs where it could potentially reside in highmem. Just
|
||||
* fail those with -EINVAL, just like we did on kernels that
|
||||
* didn't support this feature.
|
||||
*/
|
||||
if (page_array[0] != page_array[ret - 1])
|
||||
if (PageHighMem(page_array[i]))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* Can't support mapping user allocated ring memory on 32-bit archs
|
||||
* where it could potentially reside in highmem. Just fail those with
|
||||
* -EINVAL, just like we did on kernels that didn't support this
|
||||
* feature.
|
||||
* No support for discontig pages for now, should either be a
|
||||
* single normal page, or a huge page. Later on we can add
|
||||
* support for remapping discontig pages, for now we will
|
||||
* just fail them with EINVAL.
|
||||
*/
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
if (PageHighMem(page_array[i])) {
|
||||
ret = -EINVAL;
|
||||
if (page_address(page_array[i]) != page_addr)
|
||||
goto err;
|
||||
}
|
||||
page_addr += PAGE_SIZE;
|
||||
}
|
||||
|
||||
*pages = page_array;
|
||||
@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
|
||||
}
|
||||
}
|
||||
|
||||
static void *io_mem_alloc(size_t size)
|
||||
void *io_mem_alloc(size_t size)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
|
||||
void *ret;
|
||||
@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
||||
ctx->mm_account = NULL;
|
||||
}
|
||||
io_rings_free(ctx);
|
||||
io_kbuf_mmap_list_free(ctx);
|
||||
|
||||
percpu_ref_exit(&ctx->refs);
|
||||
free_uid(ctx->user);
|
||||
@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
|
||||
struct page *page;
|
||||
void *ptr;
|
||||
|
||||
/* Don't allow mmap if the ring was setup without it */
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
switch (offset & IORING_OFF_MMAP_MASK) {
|
||||
case IORING_OFF_SQ_RING:
|
||||
case IORING_OFF_CQ_RING:
|
||||
/* Don't allow mmap if the ring was setup without it */
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
||||
return ERR_PTR(-EINVAL);
|
||||
ptr = ctx->rings;
|
||||
break;
|
||||
case IORING_OFF_SQES:
|
||||
/* Don't allow mmap if the ring was setup without it */
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
||||
return ERR_PTR(-EINVAL);
|
||||
ptr = ctx->sq_sqes;
|
||||
break;
|
||||
case IORING_OFF_PBUF_RING: {
|
||||
unsigned int bgid;
|
||||
|
||||
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
rcu_read_lock();
|
||||
ptr = io_pbuf_get_address(ctx, bgid);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
rcu_read_unlock();
|
||||
if (!ptr)
|
||||
return ERR_PTR(-EINVAL);
|
||||
break;
|
||||
@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
size_t, argsz)
|
||||
{
|
||||
struct io_ring_ctx *ctx;
|
||||
struct fd f;
|
||||
struct file *file;
|
||||
long ret;
|
||||
|
||||
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
|
||||
@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return -EINVAL;
|
||||
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
f.file = tctx->registered_rings[fd];
|
||||
f.flags = 0;
|
||||
if (unlikely(!f.file))
|
||||
file = tctx->registered_rings[fd];
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
} else {
|
||||
f = fdget(fd);
|
||||
if (unlikely(!f.file))
|
||||
file = fget(fd);
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
ret = -EOPNOTSUPP;
|
||||
if (unlikely(!io_is_uring_fops(f.file)))
|
||||
if (unlikely(!io_is_uring_fops(file)))
|
||||
goto out;
|
||||
}
|
||||
|
||||
ctx = f.file->private_data;
|
||||
ctx = file->private_data;
|
||||
ret = -EBADFD;
|
||||
if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||
goto out;
|
||||
@ -3770,7 +3776,8 @@ iopoll_locked:
|
||||
}
|
||||
}
|
||||
out:
|
||||
fdput(f);
|
||||
if (!(flags & IORING_ENTER_REGISTERED_RING))
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
{
|
||||
struct io_ring_ctx *ctx;
|
||||
long ret = -EBADF;
|
||||
struct fd f;
|
||||
struct file *file;
|
||||
bool use_registered_ring;
|
||||
|
||||
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
|
||||
@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return -EINVAL;
|
||||
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
f.file = tctx->registered_rings[fd];
|
||||
f.flags = 0;
|
||||
if (unlikely(!f.file))
|
||||
file = tctx->registered_rings[fd];
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
} else {
|
||||
f = fdget(fd);
|
||||
if (unlikely(!f.file))
|
||||
file = fget(fd);
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
ret = -EOPNOTSUPP;
|
||||
if (!io_is_uring_fops(f.file))
|
||||
if (!io_is_uring_fops(file))
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
ctx = f.file->private_data;
|
||||
ctx = file->private_data;
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
ret = __io_uring_register(ctx, opcode, arg, nr_args);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
|
||||
out_fput:
|
||||
fdput(f);
|
||||
if (!use_registered_ring)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
|
||||
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
|
||||
bool cancel_all);
|
||||
|
||||
void *io_mem_alloc(size_t size);
|
||||
void io_mem_free(void *ptr);
|
||||
|
||||
#if defined(CONFIG_PROVE_LOCKING)
|
||||
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
|
||||
{
|
||||
|
173
io_uring/kbuf.c
173
io_uring/kbuf.c
@ -33,19 +33,42 @@ struct io_provide_buf {
|
||||
__u16 bid;
|
||||
};
|
||||
|
||||
struct io_buf_free {
|
||||
struct hlist_node list;
|
||||
void *mem;
|
||||
size_t size;
|
||||
int inuse;
|
||||
};
|
||||
|
||||
static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
|
||||
struct io_buffer_list *bl,
|
||||
unsigned int bgid)
|
||||
{
|
||||
if (bl && bgid < BGID_ARRAY)
|
||||
return &bl[bgid];
|
||||
|
||||
return xa_load(&ctx->io_bl_xa, bgid);
|
||||
}
|
||||
|
||||
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
|
||||
unsigned int bgid)
|
||||
{
|
||||
if (ctx->io_bl && bgid < BGID_ARRAY)
|
||||
return &ctx->io_bl[bgid];
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
return xa_load(&ctx->io_bl_xa, bgid);
|
||||
return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
|
||||
}
|
||||
|
||||
static int io_buffer_add_list(struct io_ring_ctx *ctx,
|
||||
struct io_buffer_list *bl, unsigned int bgid)
|
||||
{
|
||||
/*
|
||||
* Store buffer group ID and finally mark the list as visible.
|
||||
* The normal lookup doesn't care about the visibility as we're
|
||||
* always under the ->uring_lock, but the RCU lookup from mmap does.
|
||||
*/
|
||||
bl->bgid = bgid;
|
||||
smp_store_release(&bl->is_ready, 1);
|
||||
|
||||
if (bgid < BGID_ARRAY)
|
||||
return 0;
|
||||
|
||||
@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
|
||||
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buffer_list *bl;
|
||||
int i;
|
||||
|
||||
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
|
||||
GFP_KERNEL);
|
||||
if (!ctx->io_bl)
|
||||
bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
|
||||
if (!bl)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < BGID_ARRAY; i++) {
|
||||
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
|
||||
ctx->io_bl[i].bgid = i;
|
||||
INIT_LIST_HEAD(&bl[i].buf_list);
|
||||
bl[i].bgid = i;
|
||||
}
|
||||
|
||||
smp_store_release(&ctx->io_bl, bl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the given mapped range as free for reuse
|
||||
*/
|
||||
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
|
||||
{
|
||||
struct io_buf_free *ibf;
|
||||
|
||||
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
|
||||
if (bl->buf_ring == ibf->mem) {
|
||||
ibf->inuse = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* can't happen... */
|
||||
WARN_ON_ONCE(1);
|
||||
}
|
||||
|
||||
static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
struct io_buffer_list *bl, unsigned nbufs)
|
||||
{
|
||||
@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
if (bl->is_mapped) {
|
||||
i = bl->buf_ring->tail - bl->head;
|
||||
if (bl->is_mmap) {
|
||||
folio_put(virt_to_folio(bl->buf_ring));
|
||||
/*
|
||||
* io_kbuf_list_free() will free the page(s) at
|
||||
* ->release() time.
|
||||
*/
|
||||
io_kbuf_mark_free(ctx, bl);
|
||||
bl->buf_ring = NULL;
|
||||
bl->is_mmap = 0;
|
||||
} else if (bl->buf_nr_pages) {
|
||||
@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
|
||||
xa_for_each(&ctx->io_bl_xa, index, bl) {
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
kfree(bl);
|
||||
kfree_rcu(bl, rcu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Move deferred locked entries to cache before pruning
|
||||
*/
|
||||
spin_lock(&ctx->completion_lock);
|
||||
if (!list_empty(&ctx->io_buffers_comp))
|
||||
list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
|
||||
list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
|
||||
buf = list_entry(item, struct io_buffer, list);
|
||||
kmem_cache_free(io_buf_cachep, buf);
|
||||
@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
ret = io_buffer_add_list(ctx, bl, p->bgid);
|
||||
if (ret) {
|
||||
kfree(bl);
|
||||
/*
|
||||
* Doesn't need rcu free as it was never visible, but
|
||||
* let's keep it consistent throughout. Also can't
|
||||
* be a lower indexed array group, as adding one
|
||||
* where lookup failed cannot happen.
|
||||
*/
|
||||
if (p->bgid >= BGID_ARRAY)
|
||||
kfree_rcu(bl, rcu);
|
||||
else
|
||||
WARN_ON_ONCE(1);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
@ -531,19 +594,63 @@ error_unpin:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
|
||||
/*
|
||||
* See if we have a suitable region that we can reuse, rather than allocate
|
||||
* both a new io_buf_free and mem region again. We leave it on the list as
|
||||
* even a reused entry will need freeing at ring release.
|
||||
*/
|
||||
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
|
||||
size_t ring_size)
|
||||
{
|
||||
struct io_buf_free *ibf, *best = NULL;
|
||||
size_t best_dist;
|
||||
|
||||
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
|
||||
size_t dist;
|
||||
|
||||
if (ibf->inuse || ibf->size < ring_size)
|
||||
continue;
|
||||
dist = ibf->size - ring_size;
|
||||
if (!best || dist < best_dist) {
|
||||
best = ibf;
|
||||
if (!dist)
|
||||
break;
|
||||
best_dist = dist;
|
||||
}
|
||||
}
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
|
||||
struct io_uring_buf_reg *reg,
|
||||
struct io_buffer_list *bl)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
|
||||
struct io_buf_free *ibf;
|
||||
size_t ring_size;
|
||||
void *ptr;
|
||||
|
||||
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
|
||||
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
|
||||
|
||||
/* Reuse existing entry, if we can */
|
||||
ibf = io_lookup_buf_free_entry(ctx, ring_size);
|
||||
if (!ibf) {
|
||||
ptr = io_mem_alloc(ring_size);
|
||||
if (!ptr)
|
||||
return -ENOMEM;
|
||||
|
||||
bl->buf_ring = ptr;
|
||||
/* Allocate and store deferred free entry */
|
||||
ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
|
||||
if (!ibf) {
|
||||
io_mem_free(ptr);
|
||||
return -ENOMEM;
|
||||
}
|
||||
ibf->mem = ptr;
|
||||
ibf->size = ring_size;
|
||||
hlist_add_head(&ibf->list, &ctx->io_buf_list);
|
||||
}
|
||||
ibf->inuse = 1;
|
||||
bl->buf_ring = ibf->mem;
|
||||
bl->is_mapped = 1;
|
||||
bl->is_mmap = 1;
|
||||
return 0;
|
||||
@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
struct io_buffer_list *bl, *free_bl = NULL;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
if (copy_from_user(®, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
|
||||
@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP))
|
||||
ret = io_pin_pbuf_ring(®, bl);
|
||||
else
|
||||
ret = io_alloc_pbuf_ring(®, bl);
|
||||
ret = io_alloc_pbuf_ring(ctx, ®, bl);
|
||||
|
||||
if (!ret) {
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
kfree(free_bl);
|
||||
kfree_rcu(free_bl, rcu);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
struct io_uring_buf_reg reg;
|
||||
struct io_buffer_list *bl;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
if (copy_from_user(®, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
|
||||
@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
if (bl->bgid >= BGID_ARRAY) {
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
kfree(bl);
|
||||
kfree_rcu(bl, rcu);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
|
||||
{
|
||||
struct io_buffer_list *bl;
|
||||
|
||||
bl = io_buffer_get_list(ctx, bgid);
|
||||
bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
|
||||
|
||||
/*
|
||||
* Ensure the list is fully setup. Only strictly needed for RCU lookup
|
||||
* via mmap, and in that case only for the array indexed groups. For
|
||||
* the xarray lookups, it's either visible and ready, or not at all.
|
||||
*/
|
||||
if (!smp_load_acquire(&bl->is_ready))
|
||||
return NULL;
|
||||
if (!bl || !bl->is_mmap)
|
||||
return NULL;
|
||||
|
||||
return bl->buf_ring;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called at or after ->release(), free the mmap'ed buffers that we used
|
||||
* for memory mapped provided buffer rings.
|
||||
*/
|
||||
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buf_free *ibf;
|
||||
struct hlist_node *tmp;
|
||||
|
||||
hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
|
||||
hlist_del(&ibf->list);
|
||||
io_mem_free(ibf->mem);
|
||||
kfree(ibf);
|
||||
}
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ struct io_buffer_list {
|
||||
struct page **buf_pages;
|
||||
struct io_uring_buf_ring *buf_ring;
|
||||
};
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
__u16 bgid;
|
||||
|
||||
@ -28,6 +29,8 @@ struct io_buffer_list {
|
||||
__u8 is_mapped;
|
||||
/* ring mapped provided buffers, but mmap'ed by application */
|
||||
__u8 is_mmap;
|
||||
/* bl is visible from an RCU point of view for lookup */
|
||||
__u8 is_ready;
|
||||
};
|
||||
|
||||
struct io_buffer {
|
||||
@ -51,6 +54,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
|
||||
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
|
||||
|
||||
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
||||
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
Loading…
Reference in New Issue
Block a user