[PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support

This adds the actual pi-futex implementation, based on rt-mutexes.

[dino@in.ibm.com: fix an oops-causing race]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Ingo Molnar 2006-06-27 02:54:58 -07:00 committed by Linus Torvalds
parent 0cdbee9920
commit c87e2837be
7 changed files with 828 additions and 41 deletions

View File

@ -12,6 +12,9 @@
#define FUTEX_REQUEUE 3 #define FUTEX_REQUEUE 3
#define FUTEX_CMP_REQUEUE 4 #define FUTEX_CMP_REQUEUE 4
#define FUTEX_WAKE_OP 5 #define FUTEX_WAKE_OP 5
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
/* /*
* Support for robust futexes: the kernel cleans up held futexes at * Support for robust futexes: the kernel cleans up held futexes at
@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
#ifdef CONFIG_FUTEX #ifdef CONFIG_FUTEX
extern void exit_robust_list(struct task_struct *curr); extern void exit_robust_list(struct task_struct *curr);
extern void exit_pi_state_list(struct task_struct *curr);
#else #else
static inline void exit_robust_list(struct task_struct *curr) static inline void exit_robust_list(struct task_struct *curr)
{ {
} }
static inline void exit_pi_state_list(struct task_struct *curr)
{
}
#endif #endif
#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */

View File

@ -84,6 +84,7 @@ struct sched_param {
#include <asm/processor.h> #include <asm/processor.h>
struct exec_domain; struct exec_domain;
struct futex_pi_state;
/* /*
* List of flags we want to share for kernel threads, * List of flags we want to share for kernel threads,
@ -915,6 +916,8 @@ struct task_struct {
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list; struct compat_robust_list_head __user *compat_robust_list;
#endif #endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
atomic_t fs_excl; /* holding fs exclusive resources */ atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu; struct rcu_head rcu;

View File

@ -925,6 +925,14 @@ fastcall NORET_TYPE void do_exit(long code)
mpol_free(tsk->mempolicy); mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL; tsk->mempolicy = NULL;
#endif #endif
/*
* This must happen late, after the PID is not
* hashed anymore:
*/
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
/* /*
* If DEBUG_MUTEXES is on, make sure we are holding no locks: * If DEBUG_MUTEXES is on, make sure we are holding no locks:
*/ */

View File

@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
p->compat_robust_list = NULL; p->compat_robust_list = NULL;
#endif #endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
/* /*
* sigaltstack should be cleared when sharing the same VM * sigaltstack should be cleared when sharing the same VM
*/ */

View File

@ -12,6 +12,10 @@
* (C) Copyright 2006 Red Hat Inc, All Rights Reserved * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
* Thanks to Thomas Gleixner for suggestions, analysis and fixes. * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
* *
* PI-futex support started by Ingo Molnar and Thomas Gleixner
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew * enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation. * Kirkwood for proof-of-concept implementation.
@ -46,6 +50,8 @@
#include <linux/signal.h> #include <linux/signal.h>
#include <asm/futex.h> #include <asm/futex.h>
#include "rtmutex_common.h"
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/* /*
@ -74,6 +80,27 @@ union futex_key {
} both; } both;
}; };
/*
* Priority Inheritance state:
*/
struct futex_pi_state {
/*
* list of 'owned' pi_state instances - these have to be
* cleaned up in do_exit() if the task exits prematurely:
*/
struct list_head list;
/*
* The PI object:
*/
struct rt_mutex pi_mutex;
/*
* Task on whose pi_state_list this instance is linked. NULL while
* the instance sits unused in the per-task cache and after the
* owner's exit cleanup has detached it:
*/
struct task_struct *owner;
/*
* Reference count: starts at 1, bumped for every additional waiter
* that attaches; the instance is freed or recycled when it hits 0:
*/
atomic_t refcount;
/*
* Copy of the futex key, stored for possible exit cleanups (it is
* used to find the hash bucket again):
*/
union futex_key key;
};
/* /*
* We use this hashed waitqueue instead of a normal wait_queue_t, so * We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared). * we can wake only the relevant ones (hashed queues may be shared).
@ -96,6 +123,10 @@ struct futex_q {
/* For fd, sigio sent using these: */ /* For fd, sigio sent using these: */
int fd; int fd;
struct file *filp; struct file *filp;
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
}; };
/* /*
@ -258,6 +289,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
return ret ? -EFAULT : 0; return ret ? -EFAULT : 0;
} }
/*
 * Fault handling. Called with current->mm->mmap_sem held.
 *
 * Tries to fault in the page at @address for writing on behalf of the
 * current task. Returns 0 when the fault was handled (and accounts it as
 * a minor or major fault), -EFAULT when @attempt is 2 or more, when no
 * writable mapping covers @address, or when the fault cannot be resolved.
 *
 * NOTE(review): the callers visible in this patch pass @attempt right
 * after post-incrementing a non-zero counter, i.e. a value >= 2, which
 * takes the early -EFAULT exit below - confirm whether that is intended.
 */
static int futex_handle_fault(unsigned long address, int attempt)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/* Give up once the retry budget is exhausted. */
	if (attempt >= 2)
		return -EFAULT;

	/* The address must lie inside a writable mapping. */
	vma = find_vma(mm, address);
	if (!vma)
		return -EFAULT;
	if (vma->vm_start > address)
		return -EFAULT;
	if (!(vma->vm_flags & VM_WRITE))
		return -EFAULT;

	switch (handle_mm_fault(mm, vma, address, 1)) {
	case VM_FAULT_MINOR:
		current->min_flt++;
		return 0;
	case VM_FAULT_MAJOR:
		current->maj_flt++;
		return 0;
	default:
		return -EFAULT;
	}
}
/*
* PI code:
*/
/*
 * Make sure the current task has one pre-allocated futex_pi_state in
 * current->pi_state_cache, so that a later alloc_pi_state() cannot fail.
 *
 * Allocates with GFP_KERNEL, so it must be called from a context that is
 * allowed to sleep.
 *
 * Returns 0 if the cache is (now) populated, -ENOMEM on allocation failure.
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	/* Zeroed allocation: owner starts out NULL without an explicit store. */
	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	atomic_set(&pi_state->refcount, 1);

	current->pi_state_cache = pi_state;

	return 0;
}
/*
 * Hand out the futex_pi_state cached on the current task and empty the
 * cache slot. Warns (and returns NULL) if the cache was not refilled
 * beforehand.
 */
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *cached;

	cached = current->pi_state_cache;
	WARN_ON(!cached);

	/* Ownership moves to the caller. */
	current->pi_state_cache = NULL;

	return cached;
}
/*
 * Drop one reference on @pi_state. When the last reference goes away the
 * state is unlinked from its owner (if it still has one), the proxy lock
 * on the rt_mutex is released, and the object is either freed or recycled
 * into the current task's pi_state_cache.
 */
static void free_pi_state(struct futex_pi_state *pi_state)
{
	/* Still referenced by somebody else - nothing more to do. */
	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner != NULL) {
		spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	/* The cache slot is occupied: really free the object. */
	if (current->pi_state_cache) {
		kfree(pi_state);
		return;
	}

	/*
	 * Recycle into the empty cache slot: pi_state->list is already
	 * empty, so only the owner has to be cleared and the refcount
	 * (currently 0) put back to 1.
	 */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);
	current->pi_state_cache = pi_state;
}
/*
 * Look up the task based on what TID userspace gave us.
 * We dont trust it.
 *
 * Returns the task with an extra reference taken (the caller must drop it
 * with put_task_struct()), or NULL if no such task exists, if the caller's
 * euid matches neither the task's euid nor its uid, or if the task is a
 * zombie.
 *
 * NOTE(review): p->state is compared against EXIT_ZOMBIE, which by its
 * name looks like an exit_state value - confirm this test is intended.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	read_lock(&tasklist_lock);

	p = find_task_by_pid(pid);
	if (p) {
		if ((current->euid != p->euid && current->euid != p->uid) ||
		    p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE)
			p = NULL;
		else
			get_task_struct(p);
	}

	read_unlock(&tasklist_lock);

	return p;
}
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*
* Walks curr->pi_state_list, detaches every pi_state from the exiting
* task, marks it ownerless and unlocks its rt_mutex.
*/
void exit_pi_state_list(struct task_struct *curr)
{
struct futex_hash_bucket *hb;
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
union futex_key key;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves
*/
spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
/* Copy the key while pi_lock is held; it selects the hash bucket. */
key = pi_state->key;
/* Drop pi_lock so that the hash bucket lock can be taken first. */
spin_unlock_irq(&curr->pi_lock);
hb = hash_futex(&key);
spin_lock(&hb->lock);
spin_lock_irq(&curr->pi_lock);
/*
* The first list entry changed while pi_lock was dropped:
* release the bucket lock and start over with the new entry.
*/
if (head->next != next) {
spin_unlock(&hb->lock);
continue;
}
list_del_init(&pi_state->list);
WARN_ON(pi_state->owner != curr);
/* Mark the pi_state as ownerless. */
pi_state->owner = NULL;
spin_unlock_irq(&curr->pi_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
spin_unlock(&hb->lock);
/* Re-take pi_lock for the next list_empty() check. */
spin_lock_irq(&curr->pi_lock);
}
spin_unlock_irq(&curr->pi_lock);
}
/*
 * Find or create the PI state for the futex identified by me->key and
 * attach it to @me. The caller holds the hash bucket lock of @hb.
 *
 * @uval: futex value read from userspace; its TID field names the owner
 * @hb:   hash bucket the futex hashes to
 * @me:   the futex_q of the task about to block; me->pi_state is set on
 *        success
 *
 * Returns 0 on success, -EINVAL if an already queued waiter on the same
 * key has no PI state (a non-PI waiter), or -ESRCH if there is no other
 * waiter and the owner task named by @uval cannot be looked up.
 */
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
{
	struct futex_pi_state *pi_state = NULL;
	struct futex_q *this, *next;
	struct list_head *head;
	struct task_struct *p;
	pid_t pid;

	head = &hb->chain;

	list_for_each_entry_safe(this, next, head, list) {
		if (match_futex (&this->key, &me->key)) {
			/*
			 * Another waiter already exists - bump up
			 * the refcount and return its pi_state:
			 */
			pi_state = this->pi_state;
			/*
			 * A waiter without PI state was queued by a
			 * non-PI operation on the same address. Refuse
			 * instead of dereferencing a NULL pi_state.
			 */
			if (unlikely(!pi_state))
				return -EINVAL;

			atomic_inc(&pi_state->refcount);
			me->pi_state = pi_state;

			return 0;
		}
	}

	/*
	 * We are the first waiter - try to look up the real owner and
	 * attach the new pi_state to it:
	 */
	pid = uval & FUTEX_TID_MASK;
	p = futex_find_get_task(pid);
	if (!p)
		return -ESRCH;

	pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make 'p'
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = me->key;

	/* Link the state onto the owner's list under the owner's pi_lock. */
	spin_lock_irq(&p->pi_lock);
	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;
	spin_unlock_irq(&p->pi_lock);

	/* Drop the reference taken by futex_find_get_task(). */
	put_task_struct(p);

	me->pi_state = pi_state;

	return 0;
}
/* /*
* The hash bucket lock must be held when this is called. * The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed. * Afterwards, the futex_q must not be accessed.
@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)
q->lock_ptr = NULL; q->lock_ptr = NULL;
} }
/*
 * Hand a PI futex over to the next owner.
 *
 * @uaddr: userspace address of the futex word
 * @uval:  value the futex word is expected to hold
 * @this:  the queued waiter whose pi_state is being passed on
 *
 * Writes the new owner's TID into the futex word, moves the pi_state
 * from the old owner's pi_state_list to the new owner's, and unlocks the
 * rt_mutex.
 *
 * Returns 0 on success, -EINVAL if @this carries no PI state or the futex
 * word did not contain @uval, and -EFAULT if the user access faulted.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
{
	struct task_struct *new_owner;
	struct futex_pi_state *pi_state = this->pi_state;
	u32 curval, newval;

	if (!pi_state)
		return -EINVAL;

	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

	/*
	 * This happens when we have stolen the lock and the original
	 * pending owner did not enqueue itself back on the rt_mutex.
	 * That's not a tragedy: it tells us that a lock waiter is on
	 * the fly. We make the futex_q waiter the pending owner.
	 */
	if (!new_owner)
		new_owner = this->task;

	/*
	 * We pass it to the next owner. (The WAITERS bit is always
	 * kept enabled while there is PI state around. We must also
	 * preserve the owner died bit.)
	 */
	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;

	inc_preempt_count();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
	dec_preempt_count();

	if (curval == -EFAULT)
		return -EFAULT;
	if (curval != uval)
		return -EINVAL;

	/*
	 * Unlink the pi_state itself (not the owner's list head) from the
	 * old owner's list, under that owner's pi_lock.
	 */
	spin_lock_irq(&pi_state->owner->pi_lock);
	list_del_init(&pi_state->list);
	spin_unlock_irq(&pi_state->owner->pi_lock);

	/* Link it onto the new owner's list under the new owner's pi_lock. */
	spin_lock_irq(&new_owner->pi_lock);
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	spin_unlock_irq(&new_owner->pi_lock);

	rt_mutex_unlock(&pi_state->pi_mutex);

	return 0;
}
/*
 * Release a PI futex that has no waiters by atomically replacing @uval
 * in the futex word at @uaddr with 0.
 *
 * Returns 0 on success, -EFAULT if the user access faulted and -EAGAIN
 * if the futex word no longer held @uval.
 */
static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
	u32 seen;

	/*
	 * There is no waiter, so we unlock the futex. The owner died
	 * bit does not have to be preserved here. We are the owner:
	 */
	inc_preempt_count();
	seen = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
	dec_preempt_count();

	if (seen == -EFAULT)
		return -EFAULT;

	return (seen == uval) ? 0 : -EAGAIN;
}
/* /*
* Wake up all waiters hashed on the physical page that is mapped * Wake up all waiters hashed on the physical page that is mapped
* to this virtual address: * to this virtual address:
@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
list_for_each_entry_safe(this, next, head, list) { list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) { if (match_futex (&this->key, &key)) {
if (this->pi_state)
return -EINVAL;
wake_futex(this); wake_futex(this);
if (++ret >= nr_wake) if (++ret >= nr_wake)
break; break;
@ -385,27 +708,9 @@ retry:
* still holding the mmap_sem. * still holding the mmap_sem.
*/ */
if (attempt++) { if (attempt++) {
struct vm_area_struct * vma; if (futex_handle_fault((unsigned long)uaddr2,
struct mm_struct *mm = current->mm; attempt))
unsigned long address = (unsigned long)uaddr2;
ret = -EFAULT;
if (attempt >= 2 ||
!(vma = find_vma(mm, address)) ||
vma->vm_start > address ||
!(vma->vm_flags & VM_WRITE))
goto out; goto out;
switch (handle_mm_fault(mm, vma, address, 1)) {
case VM_FAULT_MINOR:
current->min_flt++;
break;
case VM_FAULT_MAJOR:
current->maj_flt++;
break;
default:
goto out;
}
goto retry; goto retry;
} }
@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{ {
list_add_tail(&q->list, &hb->chain); list_add_tail(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
} }
@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)
} }
WARN_ON(list_empty(&q->list)); WARN_ON(list_empty(&q->list));
list_del(&q->list); list_del(&q->list);
BUG_ON(q->pi_state);
spin_unlock(lock_ptr); spin_unlock(lock_ptr);
ret = 1; ret = 1;
} }
@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)
return ret; return ret;
} }
/*
 * PI futexes cannot be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock is held on entry and dropped here.
 *
 * Also drops the waiter's reference on its PI state and the key
 * references of @q.
 */
static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
{
	struct futex_pi_state *pi_state;

	WARN_ON(list_empty(&q->list));
	list_del(&q->list);

	pi_state = q->pi_state;
	BUG_ON(!pi_state);

	/* Detach the state from the waiter, then drop our reference. */
	q->pi_state = NULL;
	free_pi_state(pi_state);

	spin_unlock(&hb->lock);

	drop_key_refs(&q->key);
}
static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
{ {
DECLARE_WAITQUEUE(wait, current); struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb; struct futex_hash_bucket *hb;
struct futex_q q; struct futex_q q;
u32 uval; u32 uval;
int ret; int ret;
q.pi_state = NULL;
retry: retry:
down_read(&current->mm->mmap_sem); down_read(&curr->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key); ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* If we would have faulted, release mmap_sem, fault it in and * If we would have faulted, release mmap_sem, fault it in and
* start all over again. * start all over again.
*/ */
up_read(&current->mm->mmap_sem); up_read(&curr->mm->mmap_sem);
ret = get_user(uval, uaddr); ret = get_user(uval, uaddr);
@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
goto retry; goto retry;
return ret; return ret;
} }
if (uval != val) {
ret = -EWOULDBLOCK; ret = -EWOULDBLOCK;
queue_unlock(&q, hb); if (uval != val)
goto out_release_sem; goto out_unlock_release_sem;
}
/* Only actually queue if *uaddr contained val. */ /* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb); __queue_me(&q, hb);
@ -701,7 +1028,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* Now the futex is queued and we have checked the data, we * Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep. * don't want to hold mmap_sem while we sleep.
*/ */
up_read(&current->mm->mmap_sem); up_read(&curr->mm->mmap_sem);
/* /*
* There might have been scheduling since the queue_me(), as we * There might have been scheduling since the queue_me(), as we
@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
*/ */
return -EINTR; return -EINTR;
out_unlock_release_sem:
queue_unlock(&q, hb);
out_release_sem: out_release_sem:
up_read(&curr->mm->mmap_sem);
return ret;
}
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 *
 * @uaddr:   userspace address of the futex word
 * @detect:  deadlock-detection flag from userspace (only consulted by the
 *           currently disabled SIGKILL paths below)
 * @trylock: non-zero to try the rt_mutex once instead of blocking
 * @to:      optional timeout for the blocking lock, or NULL
 *
 * Returns 0 when the lock was acquired, -ENOMEM if no PI state could be
 * pre-allocated, -EDEADLK if the caller already owns the futex,
 * -EWOULDBLOCK for a failed trylock, -EFAULT on unrecoverable user access
 * faults, or the error reported by the key lookup, the PI state lookup or
 * the rt_mutex lock operation.
 */
static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
			    struct hrtimer_sleeper *to)
{
	struct task_struct *curr = current;
	struct futex_hash_bucket *hb;
	u32 uval, newval, curval;
	struct futex_q q;
	int ret, attempt = 0;

	if (refill_pi_state_cache())
		return -ENOMEM;

	q.pi_state = NULL;
 retry:
	down_read(&curr->mm->mmap_sem);

	ret = get_futex_key(uaddr, &q.key);
	if (unlikely(ret != 0))
		goto out_release_sem;

 retry_unlocked:
	hb = queue_lock(&q, -1, NULL);

 retry_locked:
	/*
	 * Start every pass with a clean status, so that an error left
	 * over from a previous pass cannot be returned by the
	 * "we got the lock" exit below.
	 */
	ret = 0;

	/*
	 * To avoid races, we attempt to take the lock here again
	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
	 * the locks. It will most likely not succeed.
	 */
	newval = current->pid;

	inc_preempt_count();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
	dec_preempt_count();

	if (unlikely(curval == -EFAULT))
		goto uaddr_faulted;

	/* We own the lock already */
	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
		/* Deliberately disabled by the constant 0. */
		if (!detect && 0)
			force_sig(SIGKILL, current);
		ret = -EDEADLK;
		goto out_unlock_release_sem;
	}

	/*
	 * Surprise - we got the lock. Just return
	 * to userspace:
	 */
	if (unlikely(!curval))
		goto out_unlock_release_sem;

	uval = curval;
	newval = uval | FUTEX_WAITERS;

	inc_preempt_count();
	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
	dec_preempt_count();

	if (unlikely(curval == -EFAULT))
		goto uaddr_faulted;
	if (unlikely(curval != uval))
		goto retry_locked;

	/*
	 * We dont have the lock. Look up the PI state (or create it if
	 * we are the first waiter):
	 */
	ret = lookup_pi_state(uval, hb, &q);

	if (unlikely(ret)) {
		/*
		 * There were no waiters and the owner task lookup
		 * failed. When the OWNER_DIED bit is set, then we
		 * know that this is a robust futex and we actually
		 * take the lock. This is safe as we are protected by
		 * the hash bucket lock. We also set the waiters bit
		 * unconditionally here, to simplify glibc handling of
		 * multiple tasks racing to acquire the lock and
		 * cleanup the problems which were left by the dead
		 * owner.
		 */
		if (curval & FUTEX_OWNER_DIED) {
			uval = newval;
			newval = current->pid |
				FUTEX_OWNER_DIED | FUTEX_WAITERS;

			inc_preempt_count();
			curval = futex_atomic_cmpxchg_inatomic(uaddr,
							       uval, newval);
			dec_preempt_count();

			if (unlikely(curval == -EFAULT))
				goto uaddr_faulted;
			if (unlikely(curval != uval))
				goto retry_locked;
			ret = 0;
		}
		goto out_unlock_release_sem;
	}

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__queue_me(&q, hb);

	/*
	 * Now the futex is queued and we have checked the data, we
	 * don't want to hold mmap_sem while we sleep.
	 */
	up_read(&curr->mm->mmap_sem);

	WARN_ON(!q.pi_state);
	/*
	 * Block on the PI mutex:
	 */
	if (!trylock)
		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
	else {
		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
	}

	down_read(&curr->mm->mmap_sem);
	hb = queue_lock(&q, -1, NULL);

	/*
	 * Got the lock. We might not be the anticipated owner if we
	 * did a lock-steal - fix up the PI-state in that case.
	 */
	if (!ret && q.pi_state->owner != curr) {
		u32 newtid = current->pid | FUTEX_WAITERS;

		/* Owner died? */
		if (q.pi_state->owner != NULL) {
			spin_lock_irq(&q.pi_state->owner->pi_lock);
			list_del_init(&q.pi_state->list);
			spin_unlock_irq(&q.pi_state->owner->pi_lock);
		} else
			newtid |= FUTEX_OWNER_DIED;

		q.pi_state->owner = current;

		spin_lock_irq(&current->pi_lock);
		list_add(&q.pi_state->list, &current->pi_state_list);
		spin_unlock_irq(&current->pi_lock);

		/* Unqueue and drop the lock */
		unqueue_me_pi(&q, hb);
		up_read(&curr->mm->mmap_sem);
		/*
		 * We own it, so we have to replace the pending owner
		 * TID. This must be atomic as we have to preserve the
		 * owner died bit here.
		 */
		ret = get_user(uval, uaddr);
		while (!ret) {
			newval = (uval & FUTEX_OWNER_DIED) | newtid;
			curval = futex_atomic_cmpxchg_inatomic(uaddr,
							       uval, newval);
			if (curval == -EFAULT)
				ret = -EFAULT;
			if (curval == uval)
				break;
			uval = curval;
		}
	} else {
		/*
		 * Catch the rare case, where the lock was released
		 * when we were on the way back before we locked
		 * the hash bucket.
		 */
		if (ret && q.pi_state->owner == curr) {
			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
				ret = 0;
		}
		/* Unqueue and drop the lock */
		unqueue_me_pi(&q, hb);
		up_read(&curr->mm->mmap_sem);
	}

	/* Deliberately disabled by the constant 0. */
	if (!detect && ret == -EDEADLK && 0)
		force_sig(SIGKILL, current);

	return ret;

 out_unlock_release_sem:
	queue_unlock(&q, hb);

 out_release_sem:
	up_read(&curr->mm->mmap_sem);
	return ret;

 uaddr_faulted:
	/*
	 * We have to r/w *(int __user *)uaddr, but we can't modify it
	 * non-atomically. Therefore, if get_user below is not
	 * enough, we need to handle the fault ourselves, while
	 * still holding the mmap_sem.
	 *
	 * The hash bucket lock is a spinlock, so it is dropped before
	 * either fixup is attempted.
	 */
	queue_unlock(&q, hb);

	if (attempt++) {
		/*
		 * Propagate the fault handler's verdict; the previous
		 * value of ret may well be 0 at this point.
		 */
		ret = futex_handle_fault((unsigned long)uaddr, attempt);
		if (ret)
			goto out_release_sem;
		goto retry_unlocked;
	}

	up_read(&curr->mm->mmap_sem);

	ret = get_user(uval, uaddr);
	if (!ret && (uval != -EFAULT))
		goto retry;

	return ret;
}
/*
 * Restart handler for an interrupted FUTEX_LOCK_PI.
 *
 * The restart block holds what futex_lock_pi() saved:
 *   arg0 - userspace address of the futex
 *   arg1 - the 'detect' flag
 *   arg2 - low 32 bits of the absolute expiry time (0 if no timeout)
 *   arg3 - high 32 bits of the absolute expiry time (0 if no timeout)
 *
 * Returns the result of the lock operation, or -ERESTART_RESTARTBLOCK
 * if it was interrupted again.
 */
static long futex_lock_pi_restart(struct restart_block *restart)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	int ret;

	restart->fn = do_no_restart_syscall;

	if (restart->arg2 || restart->arg3) {
		to = &timeout;
		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
		hrtimer_init_sleeper(to, current);
		/*
		 * Reassemble the expiry from arg3:arg2. arg0 and arg1
		 * hold the futex address and the detect flag.
		 */
		to->timer.expires.tv64 = ((u64)restart->arg3 << 32) |
			(u64) restart->arg2;
	}

	pr_debug("lock_pi restart: %p, %d (%d)\n",
		 (u32 __user *)restart->arg0, (int)restart->arg1,
		 current->pid);

	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
			       0, to);

	if (ret != -EINTR)
		return ret;

	restart->fn = futex_lock_pi_restart;

	/* The other values are filled in */
	return -ERESTART_RESTARTBLOCK;
}
/*
 * Called from the syscall entry below.
 *
 * @uaddr:   userspace address of the futex word
 * @detect:  deadlock-detection flag passed through to the lock operation
 * @sec:     seconds part of the absolute timeout, or MAX_SCHEDULE_TIMEOUT
 *           for "no timeout"
 * @nsec:    nanoseconds part of the absolute timeout
 * @trylock: non-zero for FUTEX_TRYLOCK_PI semantics
 *
 * Returns the result of do_futex_lock_pi(). If that was interrupted
 * (-EINTR), the arguments are saved in the restart block and
 * -ERESTART_RESTARTBLOCK is returned instead.
 */
static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
			 long nsec, int trylock)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct restart_block *restart;
	int ret;

	if (sec != MAX_SCHEDULE_TIMEOUT) {
		to = &timeout;
		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
		hrtimer_init_sleeper(to, current);
		to->timer.expires = ktime_set(sec, nsec);
	}

	ret = do_futex_lock_pi(uaddr, detect, trylock, to);

	if (ret != -EINTR)
		return ret;

	/* The format has three conversions: address, detect flag, pid. */
	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, detect,
		 current->pid);

	restart = &current_thread_info()->restart_block;
	restart->fn = futex_lock_pi_restart;
	restart->arg0 = (unsigned long) uaddr;
	restart->arg1 = detect;
	if (to) {
		/* Split the 64-bit expiry into low (arg2) and high (arg3). */
		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
		restart->arg3 = to->timer.expires.tv64 >> 32;
	} else
		restart->arg2 = restart->arg3 = 0;

	return -ERESTART_RESTARTBLOCK;
}
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 *
 * Returns 0 on success, -EPERM if the caller does not own the futex,
 * -EFAULT on unrecoverable user access faults, or the error reported by
 * the key lookup, wake_futex_pi() or unlock_futex_pi().
 */
static int futex_unlock_pi(u32 __user *uaddr)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	u32 uval;
	struct list_head *head;
	union futex_key key;
	int ret, attempt = 0;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != current->pid)
		return -EPERM;
	/*
	 * First take all the futex related locks:
	 */
	down_read(&current->mm->mmap_sem);

	ret = get_futex_key(uaddr, &key);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
retry_unlocked:
	spin_lock(&hb->lock);

	/*
	 * To avoid races, try to do the TID -> 0 atomic transition
	 * again. If it succeeds then we can return without waking
	 * anyone else up:
	 */
	inc_preempt_count();
	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
	dec_preempt_count();

	if (unlikely(uval == -EFAULT))
		goto pi_faulted;
	/*
	 * Rare case: we managed to release the lock atomically,
	 * no need to wake anyone else up:
	 */
	if (unlikely(uval == current->pid))
		goto out_unlock;

	/*
	 * Ok, other tasks may need to be woken up - check waiters
	 * and do the wakeup if necessary:
	 */
	head = &hb->chain;

	list_for_each_entry_safe(this, next, head, list) {
		if (!match_futex (&this->key, &key))
			continue;
		ret = wake_futex_pi(uaddr, uval, this);
		/*
		 * The atomic access to the futex value
		 * generated a pagefault, so retry the
		 * user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		goto out_unlock;
	}
	/*
	 * No waiters - kernel unlocks the futex:
	 */
	ret = unlock_futex_pi(uaddr, uval);
	if (ret == -EFAULT)
		goto pi_faulted;

out_unlock:
	spin_unlock(&hb->lock);
out:
	up_read(&current->mm->mmap_sem);

	return ret;

pi_faulted:
	/*
	 * We have to r/w *(int __user *)uaddr, but we can't modify it
	 * non-atomically. Therefore, if get_user below is not
	 * enough, we need to handle the fault ourselves, while
	 * still holding the mmap_sem.
	 *
	 * The hash bucket lock is a spinlock, so it is dropped before
	 * either fixup is attempted.
	 */
	spin_unlock(&hb->lock);

	if (attempt++) {
		/*
		 * Propagate the fault handler's verdict; the previous
		 * value of ret may well be 0 at this point.
		 */
		ret = futex_handle_fault((unsigned long)uaddr, attempt);
		if (ret)
			goto out;
		goto retry_unlocked;
	}

	up_read(&current->mm->mmap_sem);

	ret = get_user(uval, uaddr);
	if (!ret && (uval != -EFAULT))
		goto retry;

	return ret;
}
@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
err = -ENOMEM; err = -ENOMEM;
goto error; goto error;
} }
q->pi_state = NULL;
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
err = get_futex_key(uaddr, &q->key); err = get_futex_key(uaddr, &q->key);
@ -856,7 +1591,7 @@ error:
* Implementation: user-space maintains a per-thread list of locks it * Implementation: user-space maintains a per-thread list of locks it
* is holding. Upon do_exit(), the kernel carefully walks this list, * is holding. Upon do_exit(), the kernel carefully walks this list,
* and marks all locks that are owned by this thread with the * and marks all locks that are owned by this thread with the
* FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
* always manipulated with the lock held, so the list is private and * always manipulated with the lock held, so the list is private and
* per-thread. Userspace also maintains a per-thread 'list_op_pending' * per-thread. Userspace also maintains a per-thread 'list_op_pending'
* field, to allow the kernel to clean up if the thread dies after * field, to allow the kernel to clean up if the thread dies after
@ -931,7 +1666,7 @@ err_unlock:
*/ */
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
{ {
u32 uval; u32 uval, nval;
retry: retry:
if (get_user(uval, uaddr)) if (get_user(uval, uaddr))
@ -948,8 +1683,12 @@ retry:
* thread-death.) The rest of the cleanup is done in * thread-death.) The rest of the cleanup is done in
* userspace. * userspace.
*/ */
if (futex_atomic_cmpxchg_inatomic(uaddr, uval, nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
uval | FUTEX_OWNER_DIED) != uval) uval | FUTEX_OWNER_DIED);
if (nval == -EFAULT)
return -1;
if (nval != uval)
goto retry; goto retry;
if (uval & FUTEX_WAITERS) if (uval & FUTEX_WAITERS)
@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)
while (entry != &head->list) { while (entry != &head->list) {
/* /*
* A pending lock might already be on the list, so * A pending lock might already be on the list, so
* dont process it twice: * don't process it twice:
*/ */
if (entry != pending) if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset, if (handle_futex_death((void *)entry + futex_offset,
@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
case FUTEX_WAKE_OP: case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
break; break;
case FUTEX_LOCK_PI:
ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
break;
case FUTEX_UNLOCK_PI:
ret = futex_unlock_pi(uaddr);
break;
case FUTEX_TRYLOCK_PI:
ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
break;
default: default:
ret = -ENOSYS; ret = -ENOSYS;
} }
@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT; unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
u32 val2 = 0; u32 val2 = 0;
if (utime && (op == FUTEX_WAIT)) { if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (copy_from_user(&t, utime, sizeof(t)) != 0) if (copy_from_user(&t, utime, sizeof(t)) != 0)
return -EFAULT; return -EFAULT;
if (!timespec_valid(&t)) if (!timespec_valid(&t))
return -EINVAL; return -EINVAL;
if (op == FUTEX_WAIT)
timeout = timespec_to_jiffies(&t) + 1; timeout = timespec_to_jiffies(&t) + 1;
else {
timeout = t.tv_sec;
val2 = t.tv_nsec;
}
} }
/* /*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE. * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/ */
if (op >= FUTEX_REQUEUE) if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (u32) (unsigned long) utime; val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);

View File

@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT; unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0; int val2 = 0;
if (utime && (op == FUTEX_WAIT)) { if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (get_compat_timespec(&t, utime)) if (get_compat_timespec(&t, utime))
return -EFAULT; return -EFAULT;
if (!timespec_valid(&t)) if (!timespec_valid(&t))
return -EINVAL; return -EINVAL;
if (op == FUTEX_WAIT)
timeout = timespec_to_jiffies(&t) + 1; timeout = timespec_to_jiffies(&t) + 1;
else {
timeout = t.tv_sec;
val2 = t.tv_nsec;
} }
if (op >= FUTEX_REQUEUE) }
if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (int) (unsigned long) utime; val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);

View File

@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
} }
/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
#endif #endif