futex: Pull rt_mutex_futex_unlock() out from under hb->lock
There are a number of 'interesting' problems, all caused by holding
hb->lock while doing the rt_mutex_unlock() equivalent.
Notably:
 - a PI inversion on hb->lock; and,
 - a SCHED_DEADLINE crash because of pointer instability.
The previous changes:
 - changed the locking rules to cover {uval,pi_state} with wait_lock;
 - allowed rt_mutex_futex_unlock() to be called without dropping
   wait_lock, which in turn lets us rely on wait_lock atomicity
   completely;
 - simplified the waiter conundrum.
It's now sufficient to hold rtmutex::wait_lock and a reference on the
pi_state to protect the state consistency, so hb->lock can be dropped
before calling rt_mutex_futex_unlock().
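Schematically, the resulting pattern is: pin the object by taking a
reference while still holding the lock, drop the lock, then operate on
the pinned object and drop the reference. A minimal userspace analogue
of that pattern follows (illustration only, not kernel code; struct
obj, pin(), unpin() and use_slot() are invented for this sketch):

/*
 * Take a reference under the lock, drop the lock, then operate: the
 * reference keeps the object alive even if another thread drops the
 * table's reference meanwhile. Installing/removing the object in
 * table_slot is elided.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;		/* starts at 1: the table's reference */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table_slot;		/* protected by table_lock */

static void pin(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

static void unpin(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);		/* we dropped the last reference */
}

static void operate(struct obj *o)
{
	(void)o;			/* slow work, done without table_lock */
}

void use_slot(void)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = table_slot;
	if (o)
		pin(o);			/* refcount >= 1 here, cannot race to zero */
	pthread_mutex_unlock(&table_lock);

	if (o) {
		operate(o);		/* safe: our reference pins the object */
		unpin(o);
	}
}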
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
			
			
commit 16ffa12d74
parent 73d786bd04
---
 kernel/futex.c | 154 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 54 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -921,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
 		pi_state->owner = NULL;
 		raw_spin_unlock_irq(&curr->pi_lock);
 
-		rt_mutex_futex_unlock(&pi_state->pi_mutex);
-
+		get_pi_state(pi_state);
 		spin_unlock(&hb->lock);
 
+		rt_mutex_futex_unlock(&pi_state->pi_mutex);
+		put_pi_state(pi_state);
+
 		raw_spin_lock_irq(&curr->pi_lock);
 	}
 	raw_spin_unlock_irq(&curr->pi_lock);
@@ -1037,6 +1039,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
 	 * which in turn means that futex_lock_pi() still has a reference on
 	 * our pi_state.
+	 *
+	 * The waiter holding a reference on @pi_state also protects against
+	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+	 * free pi_state before we can take a reference ourselves.
 	 */
 	WARN_ON(!atomic_read(&pi_state->refcount));
 
@@ -1380,48 +1387,40 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 	smp_store_release(&q->lock_ptr, NULL);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
-			 struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-	struct task_struct *new_owner;
-	struct futex_pi_state *pi_state = top_waiter->pi_state;
 	u32 uninitialized_var(curval), newval;
+	struct task_struct *new_owner;
+	bool deboost = false;
 	DEFINE_WAKE_Q(wake_q);
-	bool deboost;
 	int ret = 0;
 
-	if (!pi_state)
-		return -EINVAL;
-
-	/*
-	 * If current does not own the pi_state then the futex is
-	 * inconsistent and user space fiddled with the futex value.
-	 */
-	if (pi_state->owner != current)
-		return -EINVAL;
-
 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
-	/*
-	 * When we interleave with futex_lock_pi() where it does
-	 * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
-	 * but the rt_mutex's wait_list can be empty (either still, or again,
-	 * depending on which side we land).
-	 *
-	 * When this happens, give up our locks and try again, giving the
-	 * futex_lock_pi() instance time to complete, either by waiting on the
-	 * rtmutex or removing itself from the futex queue.
-	 */
 	if (!new_owner) {
-		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-		return -EAGAIN;
+		/*
+		 * Since we held neither hb->lock nor wait_lock when coming
+		 * into this function, we could have raced with futex_lock_pi()
+		 * such that we might observe @this futex_q waiter, but the
+		 * rt_mutex's wait_list can be empty (either still, or again,
+		 * depending on which side we land).
+		 *
+		 * When this happens, give up our locks and try again, giving
+		 * the futex_lock_pi() instance time to complete, either by
+		 * waiting on the rtmutex or removing itself from the futex
+		 * queue.
+		 */
+		ret = -EAGAIN;
+		goto out_unlock;
 	}
 
 	/*
-	 * We pass it to the next owner. The WAITERS bit is always
-	 * kept enabled while there is PI state around. We cleanup the
-	 * owner died bit, because we are the owner.
+	 * We pass it to the next owner. The WAITERS bit is always kept
+	 * enabled while there is PI state around. We cleanup the owner
+	 * died bit, because we are the owner.
 	 */
 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
@@ -1444,10 +1443,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
 			ret = -EINVAL;
 	}
 
-	if (ret) {
-		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-		return ret;
-	}
+	if (ret)
+		goto out_unlock;
 
 	raw_spin_lock(&pi_state->owner->pi_lock);
 	WARN_ON(list_empty(&pi_state->list));
@@ -1465,15 +1462,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
 	 */
 	deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
+out_unlock:
 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-	spin_unlock(&hb->lock);
 
 	if (deboost) {
 		wake_up_q(&wake_q);
 		rt_mutex_adjust_prio(current);
 	}
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -2232,7 +2229,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	/*
 	 * We are here either because we stole the rtmutex from the
 	 * previous highest priority waiter or we are the highest priority
-	 * waiter but failed to get the rtmutex the first time.
+	 * waiter but have failed to get the rtmutex the first time.
+	 *
 	 * We have to replace the newowner TID in the user space variable.
 	 * This must be atomic as we have to preserve the owner died bit here.
 	 *
@@ -2249,7 +2247,7 @@ retry:
 	if (get_futex_value_locked(&uval, uaddr))
 		goto handle_fault;
 
-	while (1) {
+	for (;;) {
 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
 		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2345,6 +2343,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		/*
 		 * Got the lock. We might not be the anticipated owner if we
 		 * did a lock-steal - fix up the PI-state in that case:
+		 *
+		 * We can safely read pi_state->owner without holding wait_lock
+		 * because we now own the rt_mutex, only the owner will attempt
+		 * to change it.
 		 */
 		if (q->pi_state->owner != current)
 			ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2584,6 +2586,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
+	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb;
 	struct futex_q q = futex_q_init;
 	int res, ret;
@@ -2670,12 +2673,19 @@ retry_private:
 	 * If fixup_owner() faulted and was unable to handle the fault, unlock
 	 * it and return the fault to userspace.
 	 */
-	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-		rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+		pi_state = q.pi_state;
+		get_pi_state(pi_state);
+	}
 
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
+	if (pi_state) {
+		rt_mutex_futex_unlock(&pi_state->pi_mutex);
+		put_pi_state(pi_state);
+	}
+
 	goto out_put_key;
 
 out_unlock_put_key:
@@ -2738,10 +2748,36 @@ retry:
 	 */
 	top_waiter = futex_top_waiter(hb, &key);
 	if (top_waiter) {
-		ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
+		struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+		ret = -EINVAL;
+		if (!pi_state)
+			goto out_unlock;
+
 		/*
-		 * In case of success wake_futex_pi dropped the hash
-		 * bucket lock.
+		 * If current does not own the pi_state then the futex is
+		 * inconsistent and user space fiddled with the futex value.
 		 */
+		if (pi_state->owner != current)
+			goto out_unlock;
+
+		/*
+		 * Grab a reference on the pi_state and drop hb->lock.
+		 *
+		 * The reference ensures pi_state lives, dropping the hb->lock
+		 * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+		 * close the races against futex_lock_pi(), but in case of
+		 * _any_ fail we'll abort and retry the whole deal.
+		 */
+		get_pi_state(pi_state);
+		spin_unlock(&hb->lock);
+
+		ret = wake_futex_pi(uaddr, uval, pi_state);
+
+		put_pi_state(pi_state);
+
+		/*
+		 * Success, we're done! No tricky corner cases.
+		 */
 		if (!ret)
 			goto out_putkey;
@@ -2756,7 +2792,6 @@ retry:
 		 * setting the FUTEX_WAITERS bit. Try again.
 		 */
 		if (ret == -EAGAIN) {
-			spin_unlock(&hb->lock);
 			put_futex_key(&key);
 			goto retry;
 		}
@@ -2764,7 +2799,7 @@ retry:
 		 * wake_futex_pi has detected invalid state. Tell user
 		 * space.
 		 */
-		goto out_unlock;
+		goto out_putkey;
 	}
 
 	/*
@@ -2774,8 +2809,10 @@ retry:
 	 * preserve the WAITERS bit not the OWNER_DIED one. We are the
 	 * owner.
 	 */
-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+		spin_unlock(&hb->lock);
 		goto pi_faulted;
+	}
 
 	/*
 	 * If uval has changed, let user space handle it.
@@ -2789,7 +2826,6 @@ out_putkey:
 	return ret;
 
 pi_faulted:
-	spin_unlock(&hb->lock);
 	put_futex_key(&key);
 
 	ret = fault_in_user_writeable(uaddr);
@@ -2893,6 +2929,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				 u32 __user *uaddr2)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
+	struct futex_pi_state *pi_state = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
 	union futex_key key2 = FUTEX_KEY_INIT;
@@ -2977,8 +3014,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (q.pi_state && (q.pi_state->owner != current)) {
 			spin_lock(q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
-			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-				rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+				pi_state = q.pi_state;
+				get_pi_state(pi_state);
+			}
 			/*
 			 * Drop the reference to the pi state which
 			 * the requeue_pi() code acquired for us.
@@ -3017,13 +3056,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		 * the fault, unlock the rt_mutex and return the fault to
 		 * userspace.
 		 */
-		if (ret && rt_mutex_owner(pi_mutex) == current)
-			rt_mutex_futex_unlock(pi_mutex);
+		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+			pi_state = q.pi_state;
+			get_pi_state(pi_state);
+		}
 
 		/* Unqueue and drop the lock. */
 		unqueue_me_pi(&q);
 	}
 
+	if (pi_state) {
+		rt_mutex_futex_unlock(&pi_state->pi_mutex);
+		put_pi_state(pi_state);
+	}
+
 	if (ret == -EINTR) {
 		/*
 		 * We've already been requeued, but cannot restart by calling