Three small fixes/updates for the locking core code:

 - Plug a task struct reference leak in the percpu rwsem implementation.

 - Document the refcount interaction with PID_MAX_LIMIT

 - Improve the 'invalid wait context' data dump in lockdep so it
   contains all information which is required to decode the problem

-----BEGIN PGP SIGNATURE-----

iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl6TEJoTHHRnbHhAbGlu
dXRyb25peC5kZQAKCRCmGPVMDXSYoeY6D/42AKqIvpAPiPA+Aod9o8o7qDCoPJQN
kwCx939+Mwrwm8YS3WJGJvR/jgoGeq33gerY93D0+jkMyhmvYeWbUIEdCenyy0+C
tkA7pN1j61D7jMqmL8PgrObhQMM8tTdNNsiNxg5qZA8wI2oFBMbAStx5mCficCyJ
8t3hh/2aRVlNN1MBjo25+jEWEXUIchAzmxnmR+5t/pRYWythOhLuWW05D8b1JoN3
9zdHj9ZScONlrK+VmpXZl41SokT385arMXcmUuS+zPx6zl4P6yTgeHMLsm4Kt3+j
1q0hm4fUEEW2IrF1Uc9pxtJh/39UVwvX8Wfj/wJLm3b1akmO+3tQ2cpiee4NHk2H
WogvBNGG4QeGMrRkg0seiqUE6SOaNPXdGRszPc6EkzwpOywDjHVYV5qbII1VOpbK
z/TJrtv8g8ejgAvyG0dEtEDydm2SqPAIAZFXj9JQ0I/gUZKTW5rvGQWdMXW6HPif
wdJJ6xb4DC3f5DrBrXgpkMSdwfea2boIoLDMf9tCcdlRs6vmvY3iRLHy2xBLcRFt
9yVDUxO40eJUxztbFKuZsQvgJyBxZubyK+Yo9CCvWQ95GfIr2JurIJEcD2fxVyRU
57/Avt3zt7iViZ9LHJnL4JnWRxftWOdWusF2dsufmnuveCVhtA5B5bl7nGntpIT2
mfJEDrJ7zzhgyA==
=Agrh
-----END PGP SIGNATURE-----

Merge tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Three small fixes/updates for the locking core code:

   - Plug a task struct reference leak in the percpu rwsem
     implementation.

   - Document the refcount interaction with PID_MAX_LIMIT

   - Improve the 'invalid wait context' data dump in lockdep so it
     contains all information which is required to decode the problem"

* tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/lockdep: Improve 'invalid wait context' splat
  locking/refcount: Document interaction with PID_MAX_LIMIT
  locking/percpu-rwsem: Fix a task_struct refcount
commit 652fa53caa
include/linux/refcount.h

@@ -38,11 +38,24 @@
  * atomic operations, then the count will continue to edge closer to 0. If it
  * reaches a value of 1 before /any/ of the threads reset it to the saturated
  * value, then a concurrent refcount_dec_and_test() may erroneously free the
- * underlying object. Given the precise timing details involved with the
- * round-robin scheduling of each thread manipulating the refcount and the need
- * to hit the race multiple times in succession, there doesn't appear to be a
- * practical avenue of attack even if using refcount_add() operations with
- * larger increments.
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ *     0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
  *
  * Memory ordering
  * ===============
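The division in the new comment is easy to sanity-check outside the kernel. A minimal stand-alone sketch, with the constants restated locally (REFCOUNT_SATURATED is taken to be INT_MIN / 2, i.e. 0xC0000000 as a u32, matching refcount.h; this is not kernel code):

/*
 * Stand-alone check of the arithmetic quoted in the comment above.
 * Constants are local copies, not the kernel definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define REFCOUNT_SATURATED  0xC0000000u   /* INT_MIN / 2 viewed as a u32 */
#define PID_MAX_LIMIT       0x400000u     /* 4 * 1024 * 1024 tasks */

int main(void)
{
        /* Distance from the saturation value back to the wrap point. */
        uint64_t headroom = (uint64_t)UINT32_MAX + 1 - REFCOUNT_SATURATED;

        printf("headroom        = 0x%llx\n", (unsigned long long)headroom);
        printf("per-task margin = %llu\n",
               (unsigned long long)(headroom / PID_MAX_LIMIT));  /* prints 256 */
        return 0;
}

With the quoted PID limit, each task would have to move the counter by more than 256 per operation to escape the saturation range, which is the comment's conclusion.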
kernel/locking/lockdep.c

@@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
         return ret;
 }
 
+static inline short task_wait_context(struct task_struct *curr)
+{
+        /*
+         * Set appropriate wait type for the context; for IRQs we have to take
+         * into account force_irqthread as that is implied by PREEMPT_RT.
+         */
+        if (curr->hardirq_context) {
+                /*
+                 * Check if force_irqthreads will run us threaded.
+                 */
+                if (curr->hardirq_threaded || curr->irq_config)
+                        return LD_WAIT_CONFIG;
+
+                return LD_WAIT_SPIN;
+        } else if (curr->softirq_context) {
+                /*
+                 * Softirqs are always threaded.
+                 */
+                return LD_WAIT_CONFIG;
+        }
+
+        return LD_WAIT_MAX;
+}
+
 static int
 print_lock_invalid_wait_context(struct task_struct *curr,
                                 struct held_lock *hlock)
 {
+        short curr_inner;
+
         if (!debug_locks_off())
                 return 0;
         if (debug_locks_silent)
@@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr,
         print_lock(hlock);
 
         pr_warn("other info that might help us debug this:\n");
+
+        curr_inner = task_wait_context(curr);
+        pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
         lockdep_print_held_locks(curr);
 
         pr_warn("stack backtrace:\n");
@@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
         }
         depth++;
 
-        /*
-         * Set appropriate wait type for the context; for IRQs we have to take
-         * into account force_irqthread as that is implied by PREEMPT_RT.
-         */
-        if (curr->hardirq_context) {
-                /*
-                 * Check if force_irqthreads will run us threaded.
-                 */
-                if (curr->hardirq_threaded || curr->irq_config)
-                        curr_inner = LD_WAIT_CONFIG;
-                else
-                        curr_inner = LD_WAIT_SPIN;
-        } else if (curr->softirq_context) {
-                /*
-                 * Softirqs are always threaded.
-                 */
-                curr_inner = LD_WAIT_CONFIG;
-        } else {
-                curr_inner = LD_WAIT_MAX;
-        }
+        curr_inner = task_wait_context(curr);
 
         for (; depth < curr->lockdep_depth; depth++) {
                 struct held_lock *prev = curr->held_locks + depth;
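For readers decoding the new "context-{%d:%d}" line in the splat, the decision tree that task_wait_context() consolidates can be modelled outside the kernel. The sketch below is illustrative only: the enum and the task flags are local stand-ins, not the kernel's LD_WAIT_* values or task_struct fields, but the branches mirror the helper added above:

/* Illustrative stand-alone model of task_wait_context(); not kernel code. */
#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for the lockdep wait-context classes. */
enum wait_context { WAIT_SPIN, WAIT_CONFIG, WAIT_MAX };

struct task_model {                  /* stand-in for the task_struct fields used */
        bool hardirq_context;
        bool hardirq_threaded;
        bool irq_config;
        bool softirq_context;
};

static enum wait_context task_wait_context(const struct task_model *curr)
{
        if (curr->hardirq_context) {
                /* force_irqthreads (implied by PREEMPT_RT) runs the handler threaded */
                if (curr->hardirq_threaded || curr->irq_config)
                        return WAIT_CONFIG;
                return WAIT_SPIN;
        } else if (curr->softirq_context) {
                /* softirqs are always threaded */
                return WAIT_CONFIG;
        }
        return WAIT_MAX;             /* plain task context */
}

int main(void)
{
        struct task_model hardirq  = { .hardirq_context = true };
        struct task_model threaded = { .hardirq_context = true, .hardirq_threaded = true };
        struct task_model softirq  = { .softirq_context = true };
        struct task_model task     = { 0 };

        printf("%d %d %d %d\n",
               task_wait_context(&hardirq),   /* WAIT_SPIN   */
               task_wait_context(&threaded),  /* WAIT_CONFIG */
               task_wait_context(&softirq),   /* WAIT_CONFIG */
               task_wait_context(&task));     /* WAIT_MAX    */
        return 0;
}

Printing the same classification that check_wait_context() starts from is what makes the splat self-contained: the report now shows which wait class the current context allows alongside the held locks.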
kernel/locking/percpu-rwsem.c

@@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
                                       unsigned int mode, int wake_flags,
                                       void *key)
 {
-        struct task_struct *p = get_task_struct(wq_entry->private);
         bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
         struct percpu_rw_semaphore *sem = key;
+        struct task_struct *p;
 
         /* concurrent against percpu_down_write(), can get stolen */
         if (!__percpu_rwsem_trylock(sem, reader))
                 return 1;
 
+        p = get_task_struct(wq_entry->private);
         list_del_init(&wq_entry->entry);
         smp_store_release(&wq_entry->private, NULL);
 
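The percpu-rwsem change reorders the wake callback so the task_struct reference is taken only after __percpu_rwsem_trylock() succeeds; on the early-return path nothing has been acquired, so nothing can leak. A toy stand-alone model of that pattern (all names below are invented for illustration, not kernel API):

/* Toy model of the refcount leak fixed above; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct obj {
        int refcount;                /* stands in for task_struct usage counting */
};

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o) { o->refcount--; }

/* Buggy order: reference taken before we know the wake-up will happen. */
static int wake_buggy(struct obj *o, bool trylock_ok)
{
        obj_get(o);
        if (!trylock_ok)
                return 1;            /* early exit: the reference is never dropped */
        /* wake the task, then drop the reference */
        obj_put(o);
        return 0;
}

/* Fixed order: only take the reference once the wake-up is committed. */
static int wake_fixed(struct obj *o, bool trylock_ok)
{
        if (!trylock_ok)
                return 1;            /* nothing acquired, nothing leaked */
        obj_get(o);
        /* wake the task, then drop the reference */
        obj_put(o);
        return 0;
}

int main(void)
{
        struct obj o = { .refcount = 1 };

        wake_buggy(&o, false);
        printf("after buggy path: %d (one reference leaked)\n", o.refcount);  /* 2 */

        o.refcount = 1;
        wake_fixed(&o, false);
        printf("after fixed path: %d\n", o.refcount);                          /* 1 */
        return 0;
}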