mirror of
https://github.com/torvalds/linux.git
synced 2024-12-27 05:11:48 +00:00
for-5.6/io_uring-vfs-2020-01-29
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl4yEegQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpn5ZD/4/WlXs2cUDgg1C65bzZFO4qvevm+VkXmsk GbyrnFstRekvSH01/ZQxlyDVKS8Wux0XIJ6OArCh1047LvL1bEE5dvOW5iIiwa/r grjQuwFAzIPsE2fgcAO17BKIUzq2Z96+hwDzH7dw0i32yBuLvNmY/1SxcCHKfPut uzGyp7t3/2dIHbpWILRndMYe0O9j9ubmOMvKyKTwy723yDEafsUoqu2mlpigzTq4 2i+DbYBIAd8qmLqG/m3e+vOt9xodJ2Q0hlO+v6DcP2SKXU64Hb/N98HadR//aWP9 41DBXqs+dvDBcu3Jxb80PFUTiOQZECJivkns5cNcjuSXmNkOuQhDQR5K372AHmR9 m6e6FSBxwej8HselAZCI6yu9uBKd0i+MM4FnFs/O73QGYx2ayXsEXp/Jad9xiYgW pC5XJTSqJQhPE0AYYEOzHPPcBLBcpvXHkvmGKdjkNb8OLhhgh2S/YG0DNC+8ABXr j1uIe/n3kJEEmOanUyiitGyLmDq+mXd7aCVKJL/J0KiGD8Gkc1avAZ1ZrTQgjujY FqqBFawO8gv3g0L4WMI8JI+HJGMnA488obet6UKm9+l/Z/urEpXzDAKf/W/vnx2B LD0FSA0bCh1tyO6JU+avFwHlwShtV7/rx/OhrmCK7CCYKtZCA2IEctxyr8U+PBIv DtwIMTYTsA== =ZZUI -----END PGP SIGNATURE----- Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block Pull io_uring updates from Jens Axboe: - Support for various new opcodes (fallocate, openat, close, statx, fadvise, madvise, openat2, non-vectored read/write, send/recv, and epoll_ctl) - Faster ring quiesce for fileset updates - Optimizations for overflow condition checking - Support for max-sized clamping - Support for probing what opcodes are supported - Support for io-wq backend sharing between "sibling" rings - Support for registering personalities - Lots of little fixes and improvements * tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits) io_uring: add support for epoll_ctl(2) eventpoll: support non-blocking do_epoll_ctl() calls eventpoll: abstract out epoll_ctl() handler io_uring: fix linked command file table usage io_uring: support using a registered personality for commands io_uring: allow registering credentials io_uring: add io-wq workqueue sharing io-wq: allow grabbing existing io-wq io_uring/io-wq: don't use static creds/mm assignments io-wq: make the io_wq ref counted io_uring: fix refcounting with 
batched allocations at OOM io_uring: add comment for drain_next io_uring: don't attempt to copy iovec for READ/WRITE io_uring: honor IOSQE_ASYNC for linked reqs io_uring: prep req when do IOSQE_ASYNC io_uring: use labeled array init in io_op_defs io_uring: optimise sqe-to-req flags translation io_uring: remove REQ_F_IO_DRAINED io_uring: file switch work needs to get flushed on exit io_uring: hide uring_fd in ctx ...
This commit is contained in:
commit
896f8d23d0
@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
|
||||
return;
|
||||
init_task_work(&twcb->twork, binder_do_fd_close);
|
||||
__close_fd_get_file(fd, &twcb->file);
|
||||
if (twcb->file)
|
||||
if (twcb->file) {
|
||||
filp_close(twcb->file, current->files);
|
||||
task_work_add(current, &twcb->twork, true);
|
||||
else
|
||||
} else {
|
||||
kfree(twcb);
|
||||
}
|
||||
}
|
||||
|
||||
static void binder_transaction_buffer_release(struct binder_proc *proc,
|
||||
|
@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
|
||||
return container_of(p, struct ep_pqueue, pt)->epi;
|
||||
}
|
||||
|
||||
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
|
||||
static inline int ep_op_has_event(int op)
|
||||
{
|
||||
return op != EPOLL_CTL_DEL;
|
||||
}
|
||||
|
||||
/* Initialize the poll safe wake up structure */
|
||||
static void ep_nested_calls_init(struct nested_calls *ncalls)
|
||||
{
|
||||
@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
|
||||
return do_epoll_create(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* The following function implements the controller interface for
|
||||
* the eventpoll file that enables the insertion/removal/change of
|
||||
* file descriptors inside the interest set.
|
||||
*/
|
||||
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
struct epoll_event __user *, event)
|
||||
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
|
||||
bool nonblock)
|
||||
{
|
||||
if (!nonblock) {
|
||||
mutex_lock_nested(mutex, depth);
|
||||
return 0;
|
||||
}
|
||||
if (mutex_trylock(mutex))
|
||||
return 0;
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
|
||||
bool nonblock)
|
||||
{
|
||||
int error;
|
||||
int full_check = 0;
|
||||
struct fd f, tf;
|
||||
struct eventpoll *ep;
|
||||
struct epitem *epi;
|
||||
struct epoll_event epds;
|
||||
struct eventpoll *tep = NULL;
|
||||
|
||||
error = -EFAULT;
|
||||
if (ep_op_has_event(op) &&
|
||||
copy_from_user(&epds, event, sizeof(struct epoll_event)))
|
||||
goto error_return;
|
||||
|
||||
error = -EBADF;
|
||||
f = fdget(epfd);
|
||||
if (!f.file)
|
||||
@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
|
||||
/* Check if EPOLLWAKEUP is allowed */
|
||||
if (ep_op_has_event(op))
|
||||
ep_take_care_of_epollwakeup(&epds);
|
||||
ep_take_care_of_epollwakeup(epds);
|
||||
|
||||
/*
|
||||
* We have to check that the file structure underneath the file descriptor
|
||||
@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
|
||||
* Also, we do not currently support nested exclusive wakeups.
|
||||
*/
|
||||
if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
|
||||
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
|
||||
if (op == EPOLL_CTL_MOD)
|
||||
goto error_tgt_fput;
|
||||
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
|
||||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
|
||||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
|
||||
goto error_tgt_fput;
|
||||
}
|
||||
|
||||
@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
* deep wakeup paths from forming in parallel through multiple
|
||||
* EPOLL_CTL_ADD operations.
|
||||
*/
|
||||
mutex_lock_nested(&ep->mtx, 0);
|
||||
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
|
||||
if (error)
|
||||
goto error_tgt_fput;
|
||||
if (op == EPOLL_CTL_ADD) {
|
||||
if (!list_empty(&f.file->f_ep_links) ||
|
||||
is_file_epoll(tf.file)) {
|
||||
full_check = 1;
|
||||
mutex_unlock(&ep->mtx);
|
||||
mutex_lock(&epmutex);
|
||||
error = epoll_mutex_lock(&epmutex, 0, nonblock);
|
||||
if (error)
|
||||
goto error_tgt_fput;
|
||||
full_check = 1;
|
||||
if (is_file_epoll(tf.file)) {
|
||||
error = -ELOOP;
|
||||
if (ep_loop_check(ep, tf.file) != 0) {
|
||||
@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
} else
|
||||
list_add(&tf.file->f_tfile_llink,
|
||||
&tfile_check_list);
|
||||
mutex_lock_nested(&ep->mtx, 0);
|
||||
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
|
||||
if (error) {
|
||||
out_del:
|
||||
list_del(&tf.file->f_tfile_llink);
|
||||
goto error_tgt_fput;
|
||||
}
|
||||
if (is_file_epoll(tf.file)) {
|
||||
tep = tf.file->private_data;
|
||||
mutex_lock_nested(&tep->mtx, 1);
|
||||
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
|
||||
if (error) {
|
||||
mutex_unlock(&ep->mtx);
|
||||
goto out_del;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
switch (op) {
|
||||
case EPOLL_CTL_ADD:
|
||||
if (!epi) {
|
||||
epds.events |= EPOLLERR | EPOLLHUP;
|
||||
error = ep_insert(ep, &epds, tf.file, fd, full_check);
|
||||
epds->events |= EPOLLERR | EPOLLHUP;
|
||||
error = ep_insert(ep, epds, tf.file, fd, full_check);
|
||||
} else
|
||||
error = -EEXIST;
|
||||
if (full_check)
|
||||
@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
case EPOLL_CTL_MOD:
|
||||
if (epi) {
|
||||
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
|
||||
epds.events |= EPOLLERR | EPOLLHUP;
|
||||
error = ep_modify(ep, epi, &epds);
|
||||
epds->events |= EPOLLERR | EPOLLHUP;
|
||||
error = ep_modify(ep, epi, epds);
|
||||
}
|
||||
} else
|
||||
error = -ENOENT;
|
||||
@ -2231,6 +2239,23 @@ error_return:
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* The following function implements the controller interface for
|
||||
* the eventpoll file that enables the insertion/removal/change of
|
||||
* file descriptors inside the interest set.
|
||||
*/
|
||||
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
|
||||
struct epoll_event __user *, event)
|
||||
{
|
||||
struct epoll_event epds;
|
||||
|
||||
if (ep_op_has_event(op) &&
|
||||
copy_from_user(&epds, event, sizeof(struct epoll_event)))
|
||||
return -EFAULT;
|
||||
|
||||
return do_epoll_ctl(epfd, op, fd, &epds, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Implement the event wait interface for the eventpoll file. It is the kernel
|
||||
* part of the user space epoll_wait(2).
|
||||
|
@ -642,7 +642,9 @@ out_unlock:
|
||||
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
|
||||
|
||||
/*
|
||||
* variant of __close_fd that gets a ref on the file for later fput
|
||||
* variant of __close_fd that gets a ref on the file for later fput.
|
||||
* The caller must ensure that filp_close() is called on the file, and then
|
||||
* an fput().
|
||||
*/
|
||||
int __close_fd_get_file(unsigned int fd, struct file **res)
|
||||
{
|
||||
@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
|
||||
spin_unlock(&files->file_lock);
|
||||
get_file(file);
|
||||
*res = file;
|
||||
return filp_close(file, files);
|
||||
return 0;
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&files->file_lock);
|
||||
|
@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
|
||||
const struct open_flags *op);
|
||||
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
|
||||
const char *, const struct open_flags *);
|
||||
extern struct open_how build_open_how(int flags, umode_t mode);
|
||||
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
|
||||
|
||||
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
|
||||
long do_faccessat(int dfd, const char __user *filename, int mode);
|
||||
@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;
|
||||
|
||||
/* direct-io.c: */
|
||||
int sb_init_dio_done_wq(struct super_block *sb);
|
||||
|
||||
/*
|
||||
* fs/stat.c:
|
||||
*/
|
||||
unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
|
||||
int cp_statx(const struct kstat *stat, struct statx __user *buffer);
|
||||
|
103
fs/io-wq.c
103
fs/io-wq.c
@ -56,7 +56,8 @@ struct io_worker {
|
||||
|
||||
struct rcu_head rcu;
|
||||
struct mm_struct *mm;
|
||||
const struct cred *creds;
|
||||
const struct cred *cur_creds;
|
||||
const struct cred *saved_creds;
|
||||
struct files_struct *restore_files;
|
||||
};
|
||||
|
||||
@ -109,10 +110,10 @@ struct io_wq {
|
||||
|
||||
struct task_struct *manager;
|
||||
struct user_struct *user;
|
||||
const struct cred *creds;
|
||||
struct mm_struct *mm;
|
||||
refcount_t refs;
|
||||
struct completion done;
|
||||
|
||||
refcount_t use_refs;
|
||||
};
|
||||
|
||||
static bool io_worker_get(struct io_worker *worker)
|
||||
@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
|
||||
{
|
||||
bool dropped_lock = false;
|
||||
|
||||
if (worker->creds) {
|
||||
revert_creds(worker->creds);
|
||||
worker->creds = NULL;
|
||||
if (worker->saved_creds) {
|
||||
revert_creds(worker->saved_creds);
|
||||
worker->cur_creds = worker->saved_creds = NULL;
|
||||
}
|
||||
|
||||
if (current->files != worker->restore_files) {
|
||||
@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
|
||||
{
|
||||
if (worker->mm) {
|
||||
unuse_mm(worker->mm);
|
||||
mmput(worker->mm);
|
||||
worker->mm = NULL;
|
||||
}
|
||||
if (!work->mm) {
|
||||
set_fs(KERNEL_DS);
|
||||
return;
|
||||
}
|
||||
if (mmget_not_zero(work->mm)) {
|
||||
use_mm(work->mm);
|
||||
if (!worker->mm)
|
||||
set_fs(USER_DS);
|
||||
worker->mm = work->mm;
|
||||
/* hang on to this mm */
|
||||
work->mm = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
/* failed grabbing mm, ensure work gets cancelled */
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
}
|
||||
|
||||
static void io_wq_switch_creds(struct io_worker *worker,
|
||||
struct io_wq_work *work)
|
||||
{
|
||||
const struct cred *old_creds = override_creds(work->creds);
|
||||
|
||||
worker->cur_creds = work->creds;
|
||||
if (worker->saved_creds)
|
||||
put_cred(old_creds); /* creds set by previous switch */
|
||||
else
|
||||
worker->saved_creds = old_creds;
|
||||
}
|
||||
|
||||
static void io_worker_handle_work(struct io_worker *worker)
|
||||
__releases(wqe->lock)
|
||||
{
|
||||
@ -438,24 +476,19 @@ next:
|
||||
if (work->flags & IO_WQ_WORK_CB)
|
||||
work->func(&work);
|
||||
|
||||
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
|
||||
current->files != work->files) {
|
||||
if (work->files && current->files != work->files) {
|
||||
task_lock(current);
|
||||
current->files = work->files;
|
||||
task_unlock(current);
|
||||
}
|
||||
if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
|
||||
wq->mm) {
|
||||
if (mmget_not_zero(wq->mm)) {
|
||||
use_mm(wq->mm);
|
||||
set_fs(USER_DS);
|
||||
worker->mm = wq->mm;
|
||||
} else {
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
}
|
||||
}
|
||||
if (!worker->creds)
|
||||
worker->creds = override_creds(wq->creds);
|
||||
if (work->mm != worker->mm)
|
||||
io_wq_switch_mm(worker, work);
|
||||
if (worker->cur_creds != work->creds)
|
||||
io_wq_switch_creds(worker, work);
|
||||
/*
|
||||
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
|
||||
* the worker function will do the right thing.
|
||||
*/
|
||||
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
if (worker->mm)
|
||||
@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
|
||||
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
|
||||
{
|
||||
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
|
||||
int work_flags;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
|
||||
return;
|
||||
}
|
||||
|
||||
work_flags = work->flags;
|
||||
spin_lock_irqsave(&wqe->lock, flags);
|
||||
wq_list_add_tail(&work->list, &wqe->work_list);
|
||||
wqe->flags &= ~IO_WQE_FLAG_STALLED;
|
||||
spin_unlock_irqrestore(&wqe->lock, flags);
|
||||
|
||||
if (!atomic_read(&acct->nr_running))
|
||||
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
|
||||
!atomic_read(&acct->nr_running))
|
||||
io_wqe_wake_worker(wqe, acct);
|
||||
}
|
||||
|
||||
@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
|
||||
*/
|
||||
spin_lock_irqsave(&worker->lock, flags);
|
||||
if (worker->cur_work &&
|
||||
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
|
||||
data->cancel(worker->cur_work, data->caller_data)) {
|
||||
send_sig(SIGINT, worker->task, 1);
|
||||
ret = true;
|
||||
@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
|
||||
return false;
|
||||
|
||||
spin_lock_irqsave(&worker->lock, flags);
|
||||
if (worker->cur_work == work) {
|
||||
if (worker->cur_work == work &&
|
||||
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
|
||||
send_sig(SIGINT, worker->task, 1);
|
||||
ret = true;
|
||||
}
|
||||
@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
|
||||
/* caller must already hold a reference to this */
|
||||
wq->user = data->user;
|
||||
wq->creds = data->creds;
|
||||
|
||||
for_each_node(node) {
|
||||
struct io_wqe *wqe;
|
||||
@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
|
||||
init_completion(&wq->done);
|
||||
|
||||
/* caller must have already done mmgrab() on this mm */
|
||||
wq->mm = data->mm;
|
||||
|
||||
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
|
||||
if (!IS_ERR(wq->manager)) {
|
||||
wake_up_process(wq->manager);
|
||||
@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
refcount_set(&wq->use_refs, 1);
|
||||
reinit_completion(&wq->done);
|
||||
return wq;
|
||||
}
|
||||
@ -1078,13 +1113,21 @@ err:
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
|
||||
{
|
||||
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
|
||||
return false;
|
||||
|
||||
return refcount_inc_not_zero(&wq->use_refs);
|
||||
}
|
||||
|
||||
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
|
||||
{
|
||||
wake_up_process(worker->task);
|
||||
return false;
|
||||
}
|
||||
|
||||
void io_wq_destroy(struct io_wq *wq)
|
||||
static void __io_wq_destroy(struct io_wq *wq)
|
||||
{
|
||||
int node;
|
||||
|
||||
@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
|
||||
kfree(wq->wqes);
|
||||
kfree(wq);
|
||||
}
|
||||
|
||||
void io_wq_destroy(struct io_wq *wq)
|
||||
{
|
||||
if (refcount_dec_and_test(&wq->use_refs))
|
||||
__io_wq_destroy(wq);
|
||||
}
|
||||
|
11
fs/io-wq.h
11
fs/io-wq.h
@ -7,11 +7,11 @@ enum {
|
||||
IO_WQ_WORK_CANCEL = 1,
|
||||
IO_WQ_WORK_HAS_MM = 2,
|
||||
IO_WQ_WORK_HASHED = 4,
|
||||
IO_WQ_WORK_NEEDS_USER = 8,
|
||||
IO_WQ_WORK_NEEDS_FILES = 16,
|
||||
IO_WQ_WORK_UNBOUND = 32,
|
||||
IO_WQ_WORK_INTERNAL = 64,
|
||||
IO_WQ_WORK_CB = 128,
|
||||
IO_WQ_WORK_NO_CANCEL = 256,
|
||||
IO_WQ_WORK_CONCURRENT = 512,
|
||||
|
||||
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
|
||||
};
|
||||
@ -72,6 +72,8 @@ struct io_wq_work {
|
||||
};
|
||||
void (*func)(struct io_wq_work **);
|
||||
struct files_struct *files;
|
||||
struct mm_struct *mm;
|
||||
const struct cred *creds;
|
||||
unsigned flags;
|
||||
};
|
||||
|
||||
@ -81,21 +83,22 @@ struct io_wq_work {
|
||||
(work)->func = _func; \
|
||||
(work)->flags = 0; \
|
||||
(work)->files = NULL; \
|
||||
(work)->mm = NULL; \
|
||||
(work)->creds = NULL; \
|
||||
} while (0) \
|
||||
|
||||
typedef void (get_work_fn)(struct io_wq_work *);
|
||||
typedef void (put_work_fn)(struct io_wq_work *);
|
||||
|
||||
struct io_wq_data {
|
||||
struct mm_struct *mm;
|
||||
struct user_struct *user;
|
||||
const struct cred *creds;
|
||||
|
||||
get_work_fn *get_work;
|
||||
put_work_fn *put_work;
|
||||
};
|
||||
|
||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
|
||||
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
|
||||
void io_wq_destroy(struct io_wq *wq);
|
||||
|
||||
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
|
||||
|
2218
fs/io_uring.c
2218
fs/io_uring.c
File diff suppressed because it is too large
Load Diff
@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path);
|
||||
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
|
||||
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
|
||||
|
||||
static inline struct open_how build_open_how(int flags, umode_t mode)
|
||||
inline struct open_how build_open_how(int flags, umode_t mode)
|
||||
{
|
||||
struct open_how how = {
|
||||
.flags = flags & VALID_OPEN_FLAGS,
|
||||
@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
|
||||
return how;
|
||||
}
|
||||
|
||||
static inline int build_open_flags(const struct open_how *how,
|
||||
struct open_flags *op)
|
||||
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
|
||||
{
|
||||
int flags = how->flags;
|
||||
int lookup_flags = 0;
|
||||
|
34
fs/stat.c
34
fs/stat.c
@ -21,6 +21,8 @@
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/unistd.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
/**
|
||||
* generic_fillattr - Fill in the basic attributes from the inode struct
|
||||
* @inode: Inode to use as the source
|
||||
@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
|
||||
}
|
||||
EXPORT_SYMBOL(vfs_statx_fd);
|
||||
|
||||
inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
|
||||
{
|
||||
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
|
||||
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
|
||||
return -EINVAL;
|
||||
|
||||
*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
|
||||
if (flags & AT_SYMLINK_NOFOLLOW)
|
||||
*lookup_flags &= ~LOOKUP_FOLLOW;
|
||||
if (flags & AT_NO_AUTOMOUNT)
|
||||
*lookup_flags &= ~LOOKUP_AUTOMOUNT;
|
||||
if (flags & AT_EMPTY_PATH)
|
||||
*lookup_flags |= LOOKUP_EMPTY;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* vfs_statx - Get basic and extra attributes by filename
|
||||
* @dfd: A file descriptor representing the base dir for a relative filename
|
||||
@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
|
||||
{
|
||||
struct path path;
|
||||
int error = -EINVAL;
|
||||
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
|
||||
unsigned lookup_flags;
|
||||
|
||||
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
|
||||
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
|
||||
if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & AT_SYMLINK_NOFOLLOW)
|
||||
lookup_flags &= ~LOOKUP_FOLLOW;
|
||||
if (flags & AT_NO_AUTOMOUNT)
|
||||
lookup_flags &= ~LOOKUP_AUTOMOUNT;
|
||||
if (flags & AT_EMPTY_PATH)
|
||||
lookup_flags |= LOOKUP_EMPTY;
|
||||
|
||||
retry:
|
||||
error = user_path_at(dfd, filename, lookup_flags, &path);
|
||||
if (error)
|
||||
@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
|
||||
}
|
||||
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
|
||||
|
||||
static noinline_for_stack int
|
||||
noinline_for_stack int
|
||||
cp_statx(const struct kstat *stat, struct statx __user *buffer)
|
||||
{
|
||||
struct statx tmp;
|
||||
|
@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
|
||||
eventpoll_release_file(file);
|
||||
}
|
||||
|
||||
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
|
||||
bool nonblock);
|
||||
|
||||
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
|
||||
static inline int ep_op_has_event(int op)
|
||||
{
|
||||
return op != EPOLL_CTL_DEL;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void eventpoll_init_file(struct file *file) {}
|
||||
|
@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
|
||||
struct list_head *uf, bool downgrade);
|
||||
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
|
||||
struct list_head *uf);
|
||||
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
|
||||
|
||||
static inline unsigned long
|
||||
do_mmap_pgoff(struct file *file, unsigned long addr,
|
||||
|
@ -209,6 +209,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
|
||||
percpu_ref_get_many(ref, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* percpu_ref_tryget_many - try to increment a percpu refcount
|
||||
* @ref: percpu_ref to try-get
|
||||
* @nr: number of references to get
|
||||
*
|
||||
* Increment a percpu refcount by @nr unless its count already reached zero.
|
||||
* Returns %true on success; %false on failure.
|
||||
*
|
||||
* This function is safe to call as long as @ref is between init and exit.
|
||||
*/
|
||||
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
|
||||
unsigned long nr)
|
||||
{
|
||||
unsigned long __percpu *percpu_count;
|
||||
bool ret;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
if (__ref_is_percpu(ref, &percpu_count)) {
|
||||
this_cpu_add(*percpu_count, nr);
|
||||
ret = true;
|
||||
} else {
|
||||
ret = atomic_long_add_unless(&ref->count, nr, 0);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* percpu_ref_tryget - try to increment a percpu refcount
|
||||
* @ref: percpu_ref to try-get
|
||||
@ -220,21 +250,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
|
||||
*/
|
||||
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
|
||||
{
|
||||
unsigned long __percpu *percpu_count;
|
||||
bool ret;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
if (__ref_is_percpu(ref, &percpu_count)) {
|
||||
this_cpu_inc(*percpu_count);
|
||||
ret = true;
|
||||
} else {
|
||||
ret = atomic_long_inc_not_zero(&ref->count);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
return percpu_ref_tryget_many(ref, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
|
||||
* io_uring_submit_sqe - called before submitting one SQE
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @opcode: opcode of request
|
||||
* @user_data: user data associated with the request
|
||||
* @force_nonblock: whether the context is blocking or not
|
||||
* @sq_thread: true if sq_thread has submitted this SQE
|
||||
@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_submit_sqe,
|
||||
|
||||
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
|
||||
bool sq_thread),
|
||||
|
||||
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
|
||||
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
__field( bool, force_nonblock )
|
||||
__field( bool, sq_thread )
|
||||
@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
__entry->force_nonblock = force_nonblock;
|
||||
__entry->sq_thread = sq_thread;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
|
||||
__entry->ctx, (unsigned long long) __entry->user_data,
|
||||
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data,
|
||||
__entry->force_nonblock, __entry->sq_thread)
|
||||
);
|
||||
|
||||
|
@ -34,21 +34,43 @@ struct io_uring_sqe {
|
||||
__u32 timeout_flags;
|
||||
__u32 accept_flags;
|
||||
__u32 cancel_flags;
|
||||
__u32 open_flags;
|
||||
__u32 statx_flags;
|
||||
__u32 fadvise_advice;
|
||||
};
|
||||
__u64 user_data; /* data to be passed back at completion time */
|
||||
union {
|
||||
__u16 buf_index; /* index into fixed buffers, if used */
|
||||
struct {
|
||||
/* index into fixed buffers, if used */
|
||||
__u16 buf_index;
|
||||
/* personality to use, if used */
|
||||
__u16 personality;
|
||||
};
|
||||
__u64 __pad2[3];
|
||||
};
|
||||
};
|
||||
|
||||
enum {
|
||||
IOSQE_FIXED_FILE_BIT,
|
||||
IOSQE_IO_DRAIN_BIT,
|
||||
IOSQE_IO_LINK_BIT,
|
||||
IOSQE_IO_HARDLINK_BIT,
|
||||
IOSQE_ASYNC_BIT,
|
||||
};
|
||||
|
||||
/*
|
||||
* sqe->flags
|
||||
*/
|
||||
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
|
||||
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
|
||||
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
|
||||
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
|
||||
/* use fixed fileset */
|
||||
#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
|
||||
/* issue after inflight IO */
|
||||
#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
|
||||
/* links next sqe */
|
||||
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
|
||||
/* like LINK, but stronger */
|
||||
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
|
||||
/* always go async */
|
||||
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
|
||||
|
||||
/*
|
||||
* io_uring_setup() flags
|
||||
@ -57,6 +79,8 @@ struct io_uring_sqe {
|
||||
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
|
||||
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
|
||||
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
|
||||
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
|
||||
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
|
||||
|
||||
enum {
|
||||
IORING_OP_NOP,
|
||||
@ -76,6 +100,19 @@ enum {
|
||||
IORING_OP_ASYNC_CANCEL,
|
||||
IORING_OP_LINK_TIMEOUT,
|
||||
IORING_OP_CONNECT,
|
||||
IORING_OP_FALLOCATE,
|
||||
IORING_OP_OPENAT,
|
||||
IORING_OP_CLOSE,
|
||||
IORING_OP_FILES_UPDATE,
|
||||
IORING_OP_STATX,
|
||||
IORING_OP_READ,
|
||||
IORING_OP_WRITE,
|
||||
IORING_OP_FADVISE,
|
||||
IORING_OP_MADVISE,
|
||||
IORING_OP_SEND,
|
||||
IORING_OP_RECV,
|
||||
IORING_OP_OPENAT2,
|
||||
IORING_OP_EPOLL_CTL,
|
||||
|
||||
/* this goes last, obviously */
|
||||
IORING_OP_LAST,
|
||||
@ -153,7 +190,8 @@ struct io_uring_params {
|
||||
__u32 sq_thread_cpu;
|
||||
__u32 sq_thread_idle;
|
||||
__u32 features;
|
||||
__u32 resv[4];
|
||||
__u32 wq_fd;
|
||||
__u32 resv[3];
|
||||
struct io_sqring_offsets sq_off;
|
||||
struct io_cqring_offsets cq_off;
|
||||
};
|
||||
@ -164,6 +202,8 @@ struct io_uring_params {
|
||||
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
|
||||
#define IORING_FEAT_NODROP (1U << 1)
|
||||
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
|
||||
#define IORING_FEAT_RW_CUR_POS (1U << 3)
|
||||
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
|
||||
|
||||
/*
|
||||
* io_uring_register(2) opcodes and arguments
|
||||
@ -175,6 +215,10 @@ struct io_uring_params {
|
||||
#define IORING_REGISTER_EVENTFD 4
|
||||
#define IORING_UNREGISTER_EVENTFD 5
|
||||
#define IORING_REGISTER_FILES_UPDATE 6
|
||||
#define IORING_REGISTER_EVENTFD_ASYNC 7
|
||||
#define IORING_REGISTER_PROBE 8
|
||||
#define IORING_REGISTER_PERSONALITY 9
|
||||
#define IORING_UNREGISTER_PERSONALITY 10
|
||||
|
||||
struct io_uring_files_update {
|
||||
__u32 offset;
|
||||
@ -182,4 +226,21 @@ struct io_uring_files_update {
|
||||
__aligned_u64 /* __s32 * */ fds;
|
||||
};
|
||||
|
||||
#define IO_URING_OP_SUPPORTED (1U << 0)
|
||||
|
||||
struct io_uring_probe_op {
|
||||
__u8 op;
|
||||
__u8 resv;
|
||||
__u16 flags; /* IO_URING_OP_* flags */
|
||||
__u32 resv2;
|
||||
};
|
||||
|
||||
struct io_uring_probe {
|
||||
__u8 last_op; /* last opcode supported */
|
||||
__u8 ops_len; /* length of ops[] array below */
|
||||
__u16 resv;
|
||||
__u32 resv2[3];
|
||||
struct io_uring_probe_op ops[0];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
|
||||
* -EBADF - map exists, but area maps something that isn't a file.
|
||||
* -EAGAIN - a kernel resource was temporarily unavailable.
|
||||
*/
|
||||
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
||||
int do_madvise(unsigned long start, size_t len_in, int behavior)
|
||||
{
|
||||
unsigned long end, tmp;
|
||||
struct vm_area_struct *vma, *prev;
|
||||
@ -1141,3 +1141,8 @@ out:
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
||||
{
|
||||
return do_madvise(start, len_in, behavior);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user