epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready

When an epoll_event, that has the EPOLLWAKEUP flag set, is ready, a
wakeup_source will be active to prevent suspend. This can be used to
handle wakeup events from a driver that support poll, e.g. input, if
that driver wakes up the waitqueue passed to epoll before allowing
suspend.

Signed-off-by: Arve Hjønnevåg <arve@android.com>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
This commit is contained in:
Arve Hjønnevåg 2012-05-01 21:33:34 +02:00 committed by Rafael J. Wysocki
parent b86ff9820f
commit 4d7e30d989
3 changed files with 103 additions and 4 deletions

View File

@ -33,6 +33,7 @@
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/anon_inodes.h> #include <linux/anon_inodes.h>
#include <linux/device.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/mman.h> #include <asm/mman.h>
@ -87,7 +88,7 @@
*/ */
/* Epoll private bits inside the event mask */ /* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
/* Maximum number of nesting allowed inside epoll sets */ /* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4 #define EP_MAX_NESTS 4
@ -154,6 +155,9 @@ struct epitem {
/* List header used to link this item to the "struct file" items list */ /* List header used to link this item to the "struct file" items list */
struct list_head fllink; struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source *ws;
/* The structure that describe the interested events and the source fd */ /* The structure that describe the interested events and the source fd */
struct epoll_event event; struct epoll_event event;
}; };
@ -194,6 +198,9 @@ struct eventpoll {
*/ */
struct epitem *ovflist; struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */ /* The user that created the eventpoll descriptor */
struct user_struct *user; struct user_struct *user;
@ -588,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* queued into ->ovflist but the "txlist" might already * queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them. * contain them, and the list_splice() below takes care of them.
*/ */
if (!ep_is_linked(&epi->rdllink)) if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
__pm_stay_awake(epi->ws);
}
} }
/* /*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
@ -602,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* Quickly re-inject items left on "txlist". * Quickly re-inject items left on "txlist".
*/ */
list_splice(&txlist, &ep->rdllist); list_splice(&txlist, &ep->rdllist);
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) { if (!list_empty(&ep->rdllist)) {
/* /*
@ -656,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_init(&epi->rdllink); list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags); spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(epi->ws);
/* At this point it is safe to free the eventpoll item */ /* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi); kmem_cache_free(epi_cache, epi);
@ -706,6 +718,7 @@ static void ep_free(struct eventpoll *ep)
mutex_unlock(&epmutex); mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx); mutex_destroy(&ep->mtx);
free_uid(ep->user); free_uid(ep->user);
wakeup_source_unregister(ep->ws);
kfree(ep); kfree(ep);
} }
@ -737,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
* callback, but it's not actually ready, as far as * callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here. * caller requested events goes. We can remove it here.
*/ */
__pm_relax(epi->ws);
list_del_init(&epi->rdllink); list_del_init(&epi->rdllink);
} }
} }
@ -927,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
if (epi->next == EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist; epi->next = ep->ovflist;
ep->ovflist = epi; ep->ovflist = epi;
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
* deactivated at any time.
*/
__pm_stay_awake(ep->ws);
}
} }
goto out_unlock; goto out_unlock;
} }
/* If this file is already in the ready list we exit soon */ /* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
__pm_stay_awake(epi->ws);
}
/* /*
* Wake up ( if active ) both the eventpoll wait list and the ->poll() * Wake up ( if active ) both the eventpoll wait list and the ->poll()
@ -1091,6 +1115,30 @@ static int reverse_path_check(void)
return error; return error;
} }
static int ep_create_wakeup_source(struct epitem *epi)
{
const char *name;
if (!epi->ep->ws) {
epi->ep->ws = wakeup_source_register("eventpoll");
if (!epi->ep->ws)
return -ENOMEM;
}
name = epi->ffd.file->f_path.dentry->d_name.name;
epi->ws = wakeup_source_register(name);
if (!epi->ws)
return -ENOMEM;
return 0;
}
static void ep_destroy_wakeup_source(struct epitem *epi)
{
wakeup_source_unregister(epi->ws);
epi->ws = NULL;
}
/* /*
* Must be called with "mtx" held. * Must be called with "mtx" held.
*/ */
@ -1118,6 +1166,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
epi->event = *event; epi->event = *event;
epi->nwait = 0; epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR; epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
epi->ws = NULL;
}
/* Initialize the poll table using the queue callback */ /* Initialize the poll table using the queue callback */
epq.epi = epi; epq.epi = epi;
@ -1164,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* If the file is already "ready" we drop it inside the ready list */ /* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
__pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */ /* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq)) if (waitqueue_active(&ep->wq))
@ -1204,6 +1260,9 @@ error_unregister:
list_del_init(&epi->rdllink); list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags); spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(epi->ws);
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi); kmem_cache_free(epi_cache, epi);
return error; return error;
@ -1229,6 +1288,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
epi->event.events = event->events; epi->event.events = event->events;
pt._key = event->events; pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */ epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
if (!epi->ws)
ep_create_wakeup_source(epi);
} else if (epi->ws) {
ep_destroy_wakeup_source(epi);
}
/* /*
* Get current event bits. We can safely use the file* here because * Get current event bits. We can safely use the file* here because
@ -1244,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
spin_lock_irq(&ep->lock); spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) { if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
__pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */ /* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq)) if (waitqueue_active(&ep->wq))
@ -1282,6 +1348,18 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
!list_empty(head) && eventcnt < esed->maxevents;) { !list_empty(head) && eventcnt < esed->maxevents;) {
epi = list_first_entry(head, struct epitem, rdllink); epi = list_first_entry(head, struct epitem, rdllink);
/*
* Activate ep->ws before deactivating epi->ws to prevent
* triggering auto-suspend here (in case we reactive epi->ws
* below).
*
* This could be rearranged to delay the deactivation of epi->ws
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
if (epi->ws && epi->ws->active)
__pm_stay_awake(ep->ws);
__pm_relax(epi->ws);
list_del_init(&epi->rdllink); list_del_init(&epi->rdllink);
pt._key = epi->event.events; pt._key = epi->event.events;
@ -1298,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
if (__put_user(revents, &uevent->events) || if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) { __put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head); list_add(&epi->rdllink, head);
__pm_stay_awake(epi->ws);
return eventcnt ? eventcnt : -EFAULT; return eventcnt ? eventcnt : -EFAULT;
} }
eventcnt++; eventcnt++;
@ -1317,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* poll callback will queue them in ep->ovflist. * poll callback will queue them in ep->ovflist.
*/ */
list_add_tail(&epi->rdllink, &ep->rdllist); list_add_tail(&epi->rdllink, &ep->rdllist);
__pm_stay_awake(epi->ws);
} }
} }
} }
@ -1629,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
if (!tfile->f_op || !tfile->f_op->poll) if (!tfile->f_op || !tfile->f_op->poll)
goto error_tgt_fput; goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP))
goto error_tgt_fput;
/* /*
* We have to check that the file structure underneath the file descriptor * We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit * the user passed to us _is_ an eventpoll file. And also we do not permit

View File

@ -360,8 +360,11 @@ struct cpu_vfs_cap_data {
#define CAP_WAKE_ALARM 35 #define CAP_WAKE_ALARM 35
/* Allow preventing system suspends while epoll events are pending */
#define CAP_LAST_CAP CAP_WAKE_ALARM #define CAP_EPOLLWAKEUP 36
#define CAP_LAST_CAP CAP_EPOLLWAKEUP
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)

View File

@ -26,6 +26,18 @@
#define EPOLL_CTL_DEL 2 #define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3 #define EPOLL_CTL_MOD 3
/*
* Request the handling of system wakeup events so as to prevent system suspends
* from happening while those events are being processed.
*
* Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will not be
* re-allowed until epoll_wait is called again after consuming the wakeup
* event(s).
*
* Requires CAP_EPOLLWAKEUP
*/
#define EPOLLWAKEUP (1 << 29)
/* Set the One Shot behaviour for the target file descriptor */ /* Set the One Shot behaviour for the target file descriptor */
#define EPOLLONESHOT (1 << 30) #define EPOLLONESHOT (1 << 30)