From 7aeba709a048d870c15940af8b620b16281c3b9e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:42 +0200
Subject: [PATCH 01/64] rcu/nocb: Introduce RCU_NOCB_LOCKDEP_WARN()

Checking for races against concurrent (de-)offloading implies the
creation of !CONFIG_RCU_NOCB_CPU stubs to check if each relevant lock
is held. For now this only implies the nocb_lock but more are to be
expected.

Create instead a NOCB specific version of RCU_LOCKDEP_WARN() to avoid
the proliferation of stubs.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h |  7 +++++++
 kernel/rcu/tree_nocb.h   | 14 --------------
 kernel/rcu/tree_plugin.h |  4 ++--
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..d48d3c237305 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -144,11 +144,18 @@ void rcu_init_nohz(void);
 int rcu_nocb_cpu_offload(int cpu);
 int rcu_nocb_cpu_deoffload(int cpu);
 void rcu_nocb_flush_deferred_wakeup(void);
+
+#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s)
+
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
 static inline void rcu_init_nohz(void) { }
 static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
 static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
 static inline void rcu_nocb_flush_deferred_wakeup(void) { }
+
+#define RCU_NOCB_LOCKDEP_WARN(c, s)
+
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 /*
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3ce30841119a..f4112fc663a7 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -16,10 +16,6 @@
 #ifdef CONFIG_RCU_NOCB_CPU
 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
 static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
-	return lockdep_is_held(&rdp->nocb_lock);
-}
 
 static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
 {
@@ -1653,16 +1649,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
-	return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
-	return false;
-}
-
 /* No ->nocb_lock to acquire.  */
 static void rcu_nocb_lock(struct rcu_data *rdp)
 {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c569da65b421..f752b2a1d887 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -24,10 +24,10 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
 	 * timers have their own means of synchronization against the
 	 * offloaded state updaters.
 	 */
-	RCU_LOCKDEP_WARN(
+	RCU_NOCB_LOCKDEP_WARN(
 		!(lockdep_is_held(&rcu_state.barrier_mutex) ||
 		  (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
-		  rcu_lockdep_is_held_nocb(rdp) ||
+		  lockdep_is_held(&rdp->nocb_lock) ||
 		  (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
 		   rdp == this_cpu_ptr(&rcu_data)) ||
 		  rcu_current_is_nocb_kthread(rdp)),

From ff81428ede8a290fc5fd85135e39be1065ae6176 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:43 +0200
Subject: [PATCH 02/64] rcu/nocb: Move nocb field at the end of state struct

nocb_is_setup is a rarely used field, mostly on boot and CPU hotplug.
It shouldn't occupy the middle of the rcu state hot fields cacheline.

Move it to the end and build it conditionally while at it. More cold
NOCB fields are to come.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fcf2b4aa3441..a297dc89a09c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -411,7 +411,6 @@ struct rcu_state {
 	arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
 						/* Synchronize offline with */
 						/*  GP pre-initialization. */
-	int nocb_is_setup;			/* nocb is setup from boot */
 
 	/* synchronize_rcu() part. */
 	struct llist_head srs_next;	/* request a GP users. */
@@ -420,6 +419,10 @@ struct rcu_state {
 	struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
 	struct work_struct srs_cleanup_work;
 	atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+	int nocb_is_setup;			/* nocb is setup from boot */
+#endif
 };
 
 /* Values for rcu_state structure's gp_flags field. */

From 7be88a857eb84d2e0677690b81ee423dee51c93d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:44 +0200
Subject: [PATCH 03/64] rcu/nocb: Assert no callbacks while nocb kthread
 allocation fails

When a NOCB CPU fails to create a nocb kthread on bringup, the CPU is
then deoffloaded. The barrier mutex is locked at this stage. It is
typically used to protect against concurrent (de-)offloading and/or
concurrent rcu_barrier() that would otherwise risk a nocb locking
imbalance. However:

* rcu_barrier() can't run concurrently if it's the boot CPU on early
  boot-up.

* rcu_barrier() can run concurrently if it's a secondary CPU but it is
  expected to see 0 callbacks on this target because it's the first
  time it boots.

* (de-)offloading can't happen concurrently with smp_init(), as
  rcutorture is initialized later, at least not before device_initcall(),
  and userspace isn't available yet.

* (de-)offloading can't happen concurrently with cpu_up(), courtesy of
  cpu_hotplug_lock.

But:

* The lazy shrinker might run concurrently with cpu_up(). It shouldn't
  try to grab the nocb_lock and risk an imbalance due to lazy_len
  supposed to be 0 but be extra cautious.

* Also be cautious against resume from hibernation potential subtleties.

So keep the locking and add some assertions and comments.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f4112fc663a7..fdd0616f2fd1 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1442,7 +1442,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 				"rcuog/%d", rdp_gp->cpu);
 		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
 			mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
-			goto end;
+			goto err;
 		}
 		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
 		if (kthread_prio)
@@ -1454,7 +1454,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	t = kthread_create(rcu_nocb_cb_kthread, rdp,
 			   "rcuo%c/%d", rcu_state.abbr, cpu);
 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
-		goto end;
+		goto err;
 
 	if (rcu_rdp_is_offloaded(rdp))
 		wake_up_process(t);
@@ -1467,7 +1467,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	WRITE_ONCE(rdp->nocb_cb_kthread, t);
 	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
 	return;
-end:
+
+err:
+	/*
+	 * No need to protect against concurrent rcu_barrier()
+	 * because the number of callbacks should be 0 for a non-boot CPU,
+	 * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
+	 * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker.
+	 */
+	WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
 	mutex_lock(&rcu_state.barrier_mutex);
 	if (rcu_rdp_is_offloaded(rdp)) {
 		rcu_nocb_rdp_deoffload(rdp);

From 7121dd915a262aee1f5a88cf86b1543f3a9439d7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:45 +0200
Subject: [PATCH 04/64] rcu/nocb: Introduce nocb mutex

The barrier_mutex is used currently to protect (de-)offloading
operations and prevent from nocb_lock locking imbalance in rcu_barrier()
and shrinker, and also from misordered RCU barrier invocation.

Now since RCU (de-)offloading is going to happen on offline CPUs, an RCU
barrier will have to be executed while transitionning from offloaded to
de-offloaded state. And this can't happen while holding the
barrier_mutex.

Introduce a NOCB mutex to protect (de-)offloading transitions. The
barrier_mutex is still held for now when necessary to avoid barrier
callbacks reordering and nocb_lock imbalance.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c        |  3 +++
 kernel/rcu/tree.h        |  1 +
 kernel/rcu/tree_nocb.h   | 20 ++++++++++++--------
 kernel/rcu/tree_plugin.h |  1 +
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..2b9e713854b0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,6 +97,9 @@ static struct rcu_state rcu_state = {
 	.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
 		rcu_sr_normal_gp_cleanup_work),
 	.srs_cleanups_pending = ATOMIC_INIT(0),
+#ifdef CONFIG_RCU_NOCB_CPU
+	.nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex),
+#endif
 };
 
 /* Dump rcu_node combining tree at boot to verify correct setup. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a297dc89a09c..16e6fe63d93c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -421,6 +421,7 @@ struct rcu_state {
 	atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
 
 #ifdef CONFIG_RCU_NOCB_CPU
+	struct mutex nocb_mutex;		/* Guards (de-)offloading */
 	int nocb_is_setup;			/* nocb is setup from boot */
 #endif
 };
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index fdd0616f2fd1..16bcb8b13a5e 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1141,6 +1141,7 @@ int rcu_nocb_cpu_deoffload(int cpu)
 	int ret = 0;
 
 	cpus_read_lock();
+	mutex_lock(&rcu_state.nocb_mutex);
 	mutex_lock(&rcu_state.barrier_mutex);
 	if (rcu_rdp_is_offloaded(rdp)) {
 		if (cpu_online(cpu)) {
@@ -1153,6 +1154,7 @@ int rcu_nocb_cpu_deoffload(int cpu)
 		}
 	}
 	mutex_unlock(&rcu_state.barrier_mutex);
+	mutex_unlock(&rcu_state.nocb_mutex);
 	cpus_read_unlock();
 
 	return ret;
@@ -1228,6 +1230,7 @@ int rcu_nocb_cpu_offload(int cpu)
 	int ret = 0;
 
 	cpus_read_lock();
+	mutex_lock(&rcu_state.nocb_mutex);
 	mutex_lock(&rcu_state.barrier_mutex);
 	if (!rcu_rdp_is_offloaded(rdp)) {
 		if (cpu_online(cpu)) {
@@ -1240,6 +1243,7 @@ int rcu_nocb_cpu_offload(int cpu)
 		}
 	}
 	mutex_unlock(&rcu_state.barrier_mutex);
+	mutex_unlock(&rcu_state.nocb_mutex);
 	cpus_read_unlock();
 
 	return ret;
@@ -1257,7 +1261,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 		return 0;
 
 	/*  Protect rcu_nocb_mask against concurrent (de-)offloading. */
-	if (!mutex_trylock(&rcu_state.barrier_mutex))
+	if (!mutex_trylock(&rcu_state.nocb_mutex))
 		return 0;
 
 	/* Snapshot count of all CPUs */
@@ -1267,7 +1271,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 		count +=  READ_ONCE(rdp->lazy_len);
 	}
 
-	mutex_unlock(&rcu_state.barrier_mutex);
+	mutex_unlock(&rcu_state.nocb_mutex);
 
 	return count ? count : SHRINK_EMPTY;
 }
@@ -1285,9 +1289,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	 * Protect against concurrent (de-)offloading. Otherwise nocb locking
 	 * may be ignored or imbalanced.
 	 */
-	if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+	if (!mutex_trylock(&rcu_state.nocb_mutex)) {
 		/*
-		 * But really don't insist if barrier_mutex is contended since we
+		 * But really don't insist if nocb_mutex is contended since we
 		 * can't guarantee that it will never engage in a dependency
 		 * chain involving memory allocation. The lock is seldom contended
 		 * anyway.
@@ -1326,7 +1330,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			break;
 	}
 
-	mutex_unlock(&rcu_state.barrier_mutex);
+	mutex_unlock(&rcu_state.nocb_mutex);
 
 	return count ? count : SHRINK_STOP;
 }
@@ -1473,15 +1477,15 @@ err:
 	 * No need to protect against concurrent rcu_barrier()
 	 * because the number of callbacks should be 0 for a non-boot CPU,
 	 * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
-	 * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker.
+	 * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker.
 	 */
 	WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
-	mutex_lock(&rcu_state.barrier_mutex);
+	mutex_lock(&rcu_state.nocb_mutex);
 	if (rcu_rdp_is_offloaded(rdp)) {
 		rcu_nocb_rdp_deoffload(rdp);
 		cpumask_clear_cpu(cpu, rcu_nocb_mask);
 	}
-	mutex_unlock(&rcu_state.barrier_mutex);
+	mutex_unlock(&rcu_state.nocb_mutex);
 }
 
 /* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index f752b2a1d887..c662376c8af0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,6 +28,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
 		!(lockdep_is_held(&rcu_state.barrier_mutex) ||
 		  (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
 		  lockdep_is_held(&rdp->nocb_lock) ||
+		  lockdep_is_held(&rcu_state.nocb_mutex) ||
 		  (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
 		   rdp == this_cpu_ptr(&rcu_data)) ||
 		  rcu_current_is_nocb_kthread(rdp)),

From 4e26ce423116e9f48f73723cc33add295e56ad31 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:46 +0200
Subject: [PATCH 05/64] rcu/nocb: (De-)offload callbacks on offline CPUs only

Currently callbacks can be (de-)offloaded only on online CPUs. This
involves an overly elaborated state machine in order to make sure that
callbacks are always handled during the process while ensuring
synchronization between rcu_core and NOCB kthreads.

The only potential user of NOCB (de-)offloading appears to be a
nohz_full toggling interface through cpusets. And the general agreement
is now to work toward toggling the nohz_full state on offline CPUs to
simplify the whole picture.

Therefore, convert the (de-)offloading to only support offline CPUs.
This involves the following changes:

* Call rcu_barrier() before deoffloading. An offline offloaded CPU may
  still carry callbacks in its queue ignored by
  rcutree_migrate_callbacks(). Those callbacks must all be flushed
  before switching to a regular queue because no more kthreads will
  handle those before the CPU ever gets re-onlined.

  This means that further calls to rcu_barrier() will find an empty
  queue until the CPU goes through rcutree_report_cpu_starting(). As a
  result it is guaranteed that further rcu_barrier() won't try to lock
  the nocb_lock for that target and thus won't risk an imbalance.

  Therefore barrier_mutex doesn't need to be locked anymore upon
  deoffloading.

* Assume the queue is empty before offloading, as
  rcutree_migrate_callbacks() took care of everything.

  This means that further calls to rcu_barrier() will find an empty
  queue until the CPU goes through rcutree_report_cpu_starting(). As a
  result it is guaranteed that further rcu_barrier() won't risk a
  nocb_lock imbalance.

  Therefore barrier_mutex doesn't need to be locked anymore upon
  offloading.

* No need to flush bypass anymore.

Further simplifications will follow in upcoming patches.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 82 +++++++++++-------------------------------
 1 file changed, 21 insertions(+), 61 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 16bcb8b13a5e..8e766396df3a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1049,43 +1049,26 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
 	return wake_gp;
 }
 
-static long rcu_nocb_rdp_deoffload(void *arg)
+static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 {
-	struct rcu_data *rdp = arg;
 	struct rcu_segcblist *cblist = &rdp->cblist;
 	unsigned long flags;
 	int wake_gp;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
-	/*
-	 * rcu_nocb_rdp_deoffload() may be called directly if
-	 * rcuog/o[p] spawn failed, because at this time the rdp->cpu
-	 * is not online yet.
-	 */
-	WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+	/* CPU must be offline, unless it's early boot */
+	WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id());
 
 	pr_info("De-offloading %d\n", rdp->cpu);
 
+	/* Flush all callbacks from segcblist and bypass */
+	rcu_barrier();
+
 	rcu_nocb_lock_irqsave(rdp, flags);
-	/*
-	 * Flush once and for all now. This suffices because we are
-	 * running on the target CPU holding ->nocb_lock (thus having
-	 * interrupts disabled), and because rdp_offload_toggle()
-	 * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
-	 * Thus future calls to rcu_segcblist_completely_offloaded() will
-	 * return false, which means that future calls to rcu_nocb_try_bypass()
-	 * will refuse to put anything into the bypass.
-	 */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
-	/*
-	 * Start with invoking rcu_core() early. This way if the current thread
-	 * happens to preempt an ongoing call to rcu_core() in the middle,
-	 * leaving some work dismissed because rcu_core() still thinks the rdp is
-	 * completely offloaded, we are guaranteed a nearby future instance of
-	 * rcu_core() to catch up.
-	 */
+	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+
 	rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
-	invoke_rcu_core();
 	wake_gp = rdp_offload_toggle(rdp, false, flags);
 
 	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
@@ -1128,10 +1111,6 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	 */
 	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 
-	/* Sanity check */
-	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
-
-
 	return 0;
 }
 
@@ -1142,18 +1121,16 @@ int rcu_nocb_cpu_deoffload(int cpu)
 
 	cpus_read_lock();
 	mutex_lock(&rcu_state.nocb_mutex);
-	mutex_lock(&rcu_state.barrier_mutex);
 	if (rcu_rdp_is_offloaded(rdp)) {
-		if (cpu_online(cpu)) {
-			ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+		if (!cpu_online(cpu)) {
+			ret = rcu_nocb_rdp_deoffload(rdp);
 			if (!ret)
 				cpumask_clear_cpu(cpu, rcu_nocb_mask);
 		} else {
-			pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+			pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
 			ret = -EINVAL;
 		}
 	}
-	mutex_unlock(&rcu_state.barrier_mutex);
 	mutex_unlock(&rcu_state.nocb_mutex);
 	cpus_read_unlock();
 
@@ -1161,15 +1138,14 @@ int rcu_nocb_cpu_deoffload(int cpu)
 }
 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 
-static long rcu_nocb_rdp_offload(void *arg)
+static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
 {
-	struct rcu_data *rdp = arg;
 	struct rcu_segcblist *cblist = &rdp->cblist;
 	unsigned long flags;
 	int wake_gp;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
-	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+	WARN_ON_ONCE(cpu_online(rdp->cpu));
 	/*
 	 * For now we only support re-offload, ie: the rdp must have been
 	 * offloaded on boot first.
@@ -1182,28 +1158,15 @@ static long rcu_nocb_rdp_offload(void *arg)
 
 	pr_info("Offloading %d\n", rdp->cpu);
 
+	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+
 	/*
 	 * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
 	 * is set.
 	 */
 	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
 
-	/*
-	 * We didn't take the nocb lock while working on the
-	 * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
-	 * Every modifications that have been done previously on
-	 * rdp->cblist must be visible remotely by the nocb kthreads
-	 * upon wake up after reading the cblist flags.
-	 *
-	 * The layout against nocb_lock enforces that ordering:
-	 *
-	 *  __rcu_nocb_rdp_offload()   nocb_cb_wait()/nocb_gp_wait()
-	 * -------------------------   ----------------------------
-	 *      WRITE callbacks           rcu_nocb_lock()
-	 *      rcu_nocb_lock()           READ flags
-	 *      WRITE flags               READ callbacks
-	 *      rcu_nocb_unlock()         rcu_nocb_unlock()
-	 */
 	wake_gp = rdp_offload_toggle(rdp, true, flags);
 	if (wake_gp)
 		wake_up_process(rdp_gp->nocb_gp_kthread);
@@ -1214,8 +1177,7 @@ static long rcu_nocb_rdp_offload(void *arg)
 			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 
 	/*
-	 * All kthreads are ready to work, we can finally relieve rcu_core() and
-	 * enable nocb bypass.
+	 * All kthreads are ready to work, we can finally enable nocb bypass.
 	 */
 	rcu_nocb_lock_irqsave(rdp, flags);
 	rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
@@ -1231,18 +1193,16 @@ int rcu_nocb_cpu_offload(int cpu)
 
 	cpus_read_lock();
 	mutex_lock(&rcu_state.nocb_mutex);
-	mutex_lock(&rcu_state.barrier_mutex);
 	if (!rcu_rdp_is_offloaded(rdp)) {
-		if (cpu_online(cpu)) {
-			ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+		if (!cpu_online(cpu)) {
+			ret = rcu_nocb_rdp_offload(rdp);
 			if (!ret)
 				cpumask_set_cpu(cpu, rcu_nocb_mask);
 		} else {
-			pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+			pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
 			ret = -EINVAL;
 		}
 	}
-	mutex_unlock(&rcu_state.barrier_mutex);
 	mutex_unlock(&rcu_state.nocb_mutex);
 	cpus_read_unlock();
 

From d2e7f55ff2e3d878a63149c90b360cbec101a83e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:47 +0200
Subject: [PATCH 06/64] rcu/nocb: Remove halfway (de-)offloading handling from
 bypass

Bypass enqueue can't happen anymore in the middle of (de-)offloading
since this sort of transition now only applies to offline CPUs.

The related safety check can therefore be removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 8e766396df3a..af44e75eb0cd 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -409,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		return false;
 	}
 
-	// In the process of (de-)offloading: no bypassing, but
-	// locking.
-	if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
-		rcu_nocb_lock(rdp);
-		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
-		return false; /* Not offloaded, no bypassing. */
-	}
-
 	// Don't use ->nocb_bypass during early boot.
 	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
 		rcu_nocb_lock(rdp);

From 5a4f9059a8f46fc1fbbae74781a5f7c8d74c732b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:48 +0200
Subject: [PATCH 07/64] rcu/nocb: Remove halfway (de-)offloading handling from
 rcu_core()'s QS reporting

RCU core can't be running anymore while in the middle of (de-)offloading
since this sort of transition now only applies to offline CPUs.

The locked callback acceleration handling during the transition can
therefore be removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2b9e713854b0..60f271f5c079 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2386,7 +2386,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
 {
 	unsigned long flags;
 	unsigned long mask;
-	bool needacc = false;
 	struct rcu_node *rnp;
 
 	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2423,23 +2422,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
 			 * to return true.  So complain, but don't awaken.
 			 */
 			WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
-		} else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
-			/*
-			 * ...but NOCB kthreads may miss or delay callbacks acceleration
-			 * if in the middle of a (de-)offloading process.
-			 */
-			needacc = true;
 		}
 
 		rcu_disable_urgency_upon_qs(rdp);
 		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
 		/* ^^^ Released rnp->lock */
-
-		if (needacc) {
-			rcu_nocb_lock_irqsave(rdp, flags);
-			rcu_accelerate_cbs_unlocked(rnp, rdp);
-			rcu_nocb_unlock_irqrestore(rdp, flags);
-		}
 	}
 }
 

From df7c249a0ed464b3f690c6f04b8cbc0fdbf30afc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:49 +0200
Subject: [PATCH 08/64] rcu/nocb: Remove halfway (de-)offloading handling from
 rcu_core

RCU core can't be running anymore while in the middle of (de-)offloading
since this sort of transition now only applies to offline CPUs.

The locked callback acceleration handling during the transition can
therefore be removed, along with concurrent batch execution.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 60f271f5c079..1a272c678533 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2781,24 +2781,6 @@ static __latent_entropy void rcu_core(void)
 	unsigned long flags;
 	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;
-	/*
-	 * On RT rcu_core() can be preempted when IRQs aren't disabled.
-	 * Therefore this function can race with concurrent NOCB (de-)offloading
-	 * on this CPU and the below condition must be considered volatile.
-	 * However if we race with:
-	 *
-	 * _ Offloading:   In the worst case we accelerate or process callbacks
-	 *                 concurrently with NOCB kthreads. We are guaranteed to
-	 *                 call rcu_nocb_lock() if that happens.
-	 *
-	 * _ Deoffloading: In the worst case we miss callbacks acceleration or
-	 *                 processing. This is fine because the early stage
-	 *                 of deoffloading invokes rcu_core() after setting
-	 *                 SEGCBLIST_RCU_CORE. So we guarantee that we'll process
-	 *                 what could have been dismissed without the need to wait
-	 *                 for the next rcu_pending() check in the next jiffy.
-	 */
-	const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
 
 	if (cpu_is_offline(smp_processor_id()))
 		return;
@@ -2818,17 +2800,17 @@ static __latent_entropy void rcu_core(void)
 
 	/* No grace period and unregistered callbacks? */
 	if (!rcu_gp_in_progress() &&
-	    rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
-		rcu_nocb_lock_irqsave(rdp, flags);
+	    rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
+		local_irq_save(flags);
 		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 			rcu_accelerate_cbs_unlocked(rnp, rdp);
-		rcu_nocb_unlock_irqrestore(rdp, flags);
+		local_irq_restore(flags);
 	}
 
 	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
 
 	/* If there are callbacks ready, invoke them. */
-	if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+	if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) &&
 	    likely(READ_ONCE(rcu_scheduler_fully_active))) {
 		rcu_do_batch(rdp);
 		/* Re-invoke RCU core processing if there are callbacks remaining. */

From bae6076ebbd14cc1f1bd4de65c2db21d3ab109d7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:50 +0200
Subject: [PATCH 09/64] rcu/nocb: Remove SEGCBLIST_RCU_CORE

RCU core can't be running anymore while in the middle of (de-)offloading
since this sort of transition now only applies to offline CPUs.

The SEGCBLIST_RCU_CORE state can therefore be removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h | 9 ++++-----
 kernel/rcu/rcu_segcblist.h    | 9 ---------
 kernel/rcu/tree.c             | 3 ---
 kernel/rcu/tree_nocb.h        | 9 ---------
 4 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index ba95c06675e1..5469c54cd778 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -185,11 +185,10 @@ struct rcu_cblist {
  *  ----------------------------------------------------------------------------
  */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_RCU_CORE	BIT(1)
-#define SEGCBLIST_LOCKING	BIT(2)
-#define SEGCBLIST_KTHREAD_CB	BIT(3)
-#define SEGCBLIST_KTHREAD_GP	BIT(4)
-#define SEGCBLIST_OFFLOADED	BIT(5)
+#define SEGCBLIST_LOCKING	BIT(1)
+#define SEGCBLIST_KTHREAD_CB	BIT(2)
+#define SEGCBLIST_KTHREAD_GP	BIT(3)
+#define SEGCBLIST_OFFLOADED	BIT(4)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 4fe877f5f654..7a0962dfee86 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -95,15 +95,6 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
 	return false;
 }
 
-static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
-{
-	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
-	    !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
-		return true;
-
-	return false;
-}
-
 /*
  * Are all segments following the specified segment of the specified
  * rcu_segcblist structure empty of callbacks?  (The specified
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1a272c678533..82e831b969e4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.gpwrap = true,
-#ifdef CONFIG_RCU_NOCB_CPU
-	.cblist.flags = SEGCBLIST_RCU_CORE,
-#endif
 };
 static struct rcu_state rcu_state = {
 	.level = { &rcu_state.node[0] },
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index af44e75eb0cd..24daf606de0c 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1060,7 +1060,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
 
-	rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
 	wake_gp = rdp_offload_toggle(rdp, false, flags);
 
 	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
@@ -1168,13 +1167,6 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
 	swait_event_exclusive(rdp->nocb_state_wq,
 			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 
-	/*
-	 * All kthreads are ready to work, we can finally enable nocb bypass.
-	 */
-	rcu_nocb_lock_irqsave(rdp, flags);
-	rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
-	rcu_nocb_unlock_irqrestore(rdp, flags);
-
 	return 0;
 }
 
@@ -1350,7 +1342,6 @@ void __init rcu_init_nohz(void)
 			rcu_segcblist_init(&rdp->cblist);
 		rcu_segcblist_offload(&rdp->cblist, true);
 		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
-		rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
 	}
 	rcu_organize_nocb_kthreads();
 }

From 91e43b9044a4f9b6976dc28b3f1446b733cc68de Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:51 +0200
Subject: [PATCH 10/64] rcu/nocb: Remove SEGCBLIST_KTHREAD_CB

This state excerpt from the (de-)offloading state machine was used to
implement an ad-hoc kthread parking of rcuo kthreads. This code has
been removed and therefore the related state can be erased as well.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 5469c54cd778..1ef1bb54853d 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -186,9 +186,8 @@ struct rcu_cblist {
  */
 #define SEGCBLIST_ENABLED	BIT(0)
 #define SEGCBLIST_LOCKING	BIT(1)
-#define SEGCBLIST_KTHREAD_CB	BIT(2)
-#define SEGCBLIST_KTHREAD_GP	BIT(3)
-#define SEGCBLIST_OFFLOADED	BIT(4)
+#define SEGCBLIST_KTHREAD_GP	BIT(2)
+#define SEGCBLIST_OFFLOADED	BIT(3)
 
 struct rcu_segcblist {
 	struct rcu_head *head;

From cf3b1501a8aa8e817baede5e3e1f2beb26a09527 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 14 Jun 2024 13:35:47 -0700
Subject: [PATCH 11/64] rcutorture: Remove redundant rcu_torture_ops
 get_gp_completed fields

The rcu_torture_ops structure's ->get_gp_completed and
->get_gp_completed_full fields are redundant with its ->get_comp_state
and ->get_comp_state_full fields.  This commit therefore removes the
former in favor of the latter.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcutorture.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 08bf7c669dd3..836e27250a8b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -366,8 +366,6 @@ struct rcu_torture_ops {
 	bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
 	unsigned long (*get_gp_state)(void);
 	void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
-	unsigned long (*get_gp_completed)(void);
-	void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*start_gp_poll)(void);
 	void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_gp_state)(unsigned long oldstate);
@@ -553,8 +551,6 @@ static struct rcu_torture_ops rcu_ops = {
 	.get_comp_state_full	= get_completed_synchronize_rcu_full,
 	.get_gp_state		= get_state_synchronize_rcu,
 	.get_gp_state_full	= get_state_synchronize_rcu_full,
-	.get_gp_completed	= get_completed_synchronize_rcu,
-	.get_gp_completed_full	= get_completed_synchronize_rcu_full,
 	.start_gp_poll		= start_poll_synchronize_rcu,
 	.start_gp_poll_full	= start_poll_synchronize_rcu_full,
 	.poll_gp_state		= poll_state_synchronize_rcu,
@@ -1437,8 +1433,8 @@ rcu_torture_writer(void *arg)
 					  rcu_torture_writer_state_getname(),
 					  rcu_torture_writer_state,
 					  cookie, cur_ops->get_gp_state());
-				if (cur_ops->get_gp_completed) {
-					cookie = cur_ops->get_gp_completed();
+				if (cur_ops->get_comp_state) {
+					cookie = cur_ops->get_comp_state();
 					WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
 				}
 				cur_ops->readunlock(idx);
@@ -1452,8 +1448,8 @@ rcu_torture_writer(void *arg)
 					  rcu_torture_writer_state_getname(),
 					  rcu_torture_writer_state,
 					  cpumask_pr_args(cpu_online_mask));
-				if (cur_ops->get_gp_completed_full) {
-					cur_ops->get_gp_completed_full(&cookie_full);
+				if (cur_ops->get_comp_state_full) {
+					cur_ops->get_comp_state_full(&cookie_full);
 					WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
 				}
 				cur_ops->readunlock(idx);

From 1bc5bb9a61378bbb2ca73ab06651d5b174876a8e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 14 Jun 2024 16:47:12 -0700
Subject: [PATCH 12/64] rcutorture: Add SRCU ->same_gp_state and
 ->get_comp_state functions

This commit points the SRCU ->same_gp_state and ->get_comp_state fields
to same_state_synchronize_srcu() and get_completed_synchronize_srcu(),
allowing them to be tested.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcutorture.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 836e27250a8b..b2e6201b4569 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -736,6 +736,8 @@ static struct rcu_torture_ops srcu_ops = {
 	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
 	.exp_sync	= srcu_torture_synchronize_expedited,
+	.same_gp_state	= same_state_synchronize_srcu,
+	.get_comp_state = get_completed_synchronize_srcu,
 	.get_gp_state	= srcu_torture_get_gp_state,
 	.start_gp_poll	= srcu_torture_start_gp_poll,
 	.poll_gp_state	= srcu_torture_poll_gp_state,
@@ -776,6 +778,8 @@ static struct rcu_torture_ops srcud_ops = {
 	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
 	.exp_sync	= srcu_torture_synchronize_expedited,
+	.same_gp_state	= same_state_synchronize_srcu,
+	.get_comp_state = get_completed_synchronize_srcu,
 	.get_gp_state	= srcu_torture_get_gp_state,
 	.start_gp_poll	= srcu_torture_start_gp_poll,
 	.poll_gp_state	= srcu_torture_poll_gp_state,

From 72ed29f68e6370841ba10edce5b9a213259eb9a9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 14 Jun 2024 21:34:17 -0700
Subject: [PATCH 13/64] rcutorture: Generic test for NUM_ACTIVE_*RCU_POLL*

The rcutorture test suite has specific tests for both of the
NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE
macros provided for RCU polled grace periods.  However, with the
advent of NUM_ACTIVE_SRCU_POLL_OLDSTATE, a more generic test is needed.
This commit therefore adds ->poll_active and ->poll_active_full fields
to the rcu_torture_ops structure and converts the existing specific
tests to use these fields, when present.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcutorture.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b2e6201b4569..acf9f9945d2b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -373,6 +373,8 @@ struct rcu_torture_ops {
 	bool (*poll_need_2gp)(bool poll, bool poll_full);
 	void (*cond_sync)(unsigned long oldstate);
 	void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+	int poll_active;
+	int poll_active_full;
 	call_rcu_func_t call;
 	void (*cb_barrier)(void);
 	void (*fqs)(void);
@@ -558,6 +560,8 @@ static struct rcu_torture_ops rcu_ops = {
 	.poll_need_2gp		= rcu_poll_need_2gp,
 	.cond_sync		= cond_synchronize_rcu,
 	.cond_sync_full		= cond_synchronize_rcu_full,
+	.poll_active		= NUM_ACTIVE_RCU_POLL_OLDSTATE,
+	.poll_active_full	= NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE,
 	.get_gp_state_exp	= get_state_synchronize_rcu,
 	.start_gp_poll_exp	= start_poll_synchronize_rcu_expedited,
 	.start_gp_poll_exp_full	= start_poll_synchronize_rcu_expedited_full,
@@ -741,6 +745,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.get_gp_state	= srcu_torture_get_gp_state,
 	.start_gp_poll	= srcu_torture_start_gp_poll,
 	.poll_gp_state	= srcu_torture_poll_gp_state,
+	.poll_active	= NUM_ACTIVE_SRCU_POLL_OLDSTATE,
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
@@ -783,6 +788,7 @@ static struct rcu_torture_ops srcud_ops = {
 	.get_gp_state	= srcu_torture_get_gp_state,
 	.start_gp_poll	= srcu_torture_start_gp_poll,
 	.poll_gp_state	= srcu_torture_poll_gp_state,
+	.poll_active	= NUM_ACTIVE_SRCU_POLL_OLDSTATE,
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
@@ -1374,13 +1380,15 @@ rcu_torture_writer(void *arg)
 	int i;
 	int idx;
 	int oldnice = task_nice(current);
-	struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
+	struct rcu_gp_oldstate *rgo = NULL;
+	int rgo_size = 0;
 	struct rcu_torture *rp;
 	struct rcu_torture *old_rp;
 	static DEFINE_TORTURE_RANDOM(rand);
 	unsigned long stallsdone = jiffies;
 	bool stutter_waited;
-	unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+	unsigned long *ulo = NULL;
+	int ulo_size = 0;
 
 	// If a new stall test is added, this must be adjusted.
 	if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
@@ -1401,6 +1409,16 @@ rcu_torture_writer(void *arg)
 		torture_kthread_stopping("rcu_torture_writer");
 		return 0;
 	}
+	if (cur_ops->poll_active > 0) {
+		ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL);
+		if (!WARN_ON(!ulo))
+			ulo_size = cur_ops->poll_active;
+	}
+	if (cur_ops->poll_active_full > 0) {
+		rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL);
+		if (!WARN_ON(!rgo))
+			rgo_size = cur_ops->poll_active_full;
+	}
 
 	do {
 		rcu_torture_writer_state = RTWS_FIXED_DELAY;
@@ -1502,19 +1520,19 @@ rcu_torture_writer(void *arg)
 				break;
 			case RTWS_POLL_GET:
 				rcu_torture_writer_state = RTWS_POLL_GET;
-				for (i = 0; i < ARRAY_SIZE(ulo); i++)
+				for (i = 0; i < ulo_size; i++)
 					ulo[i] = cur_ops->get_comp_state();
 				gp_snap = cur_ops->start_gp_poll();
 				rcu_torture_writer_state = RTWS_POLL_WAIT;
 				while (!cur_ops->poll_gp_state(gp_snap)) {
 					gp_snap1 = cur_ops->get_gp_state();
-					for (i = 0; i < ARRAY_SIZE(ulo); i++)
+					for (i = 0; i < ulo_size; i++)
 						if (cur_ops->poll_gp_state(ulo[i]) ||
 						    cur_ops->same_gp_state(ulo[i], gp_snap1)) {
 							ulo[i] = gp_snap1;
 							break;
 						}
-					WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
+					WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size);
 					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
 								  &rand);
 				}
@@ -1522,20 +1540,20 @@ rcu_torture_writer(void *arg)
 				break;
 			case RTWS_POLL_GET_FULL:
 				rcu_torture_writer_state = RTWS_POLL_GET_FULL;
-				for (i = 0; i < ARRAY_SIZE(rgo); i++)
+				for (i = 0; i < rgo_size; i++)
 					cur_ops->get_comp_state_full(&rgo[i]);
 				cur_ops->start_gp_poll_full(&gp_snap_full);
 				rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
 				while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
 					cur_ops->get_gp_state_full(&gp_snap1_full);
-					for (i = 0; i < ARRAY_SIZE(rgo); i++)
+					for (i = 0; i < rgo_size; i++)
 						if (cur_ops->poll_gp_state_full(&rgo[i]) ||
 						    cur_ops->same_gp_state_full(&rgo[i],
 										&gp_snap1_full)) {
 							rgo[i] = gp_snap1_full;
 							break;
 						}
-					WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+					WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size);
 					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
 								  &rand);
 				}
@@ -1617,6 +1635,8 @@ rcu_torture_writer(void *arg)
 		pr_alert("%s" TORTURE_FLAG
 			 " Dynamic grace-period expediting was disabled.\n",
 			 torture_type);
+	kfree(ulo);
+	kfree(rgo);
 	rcu_torture_writer_state = RTWS_STOPPING;
 	torture_kthread_stopping("rcu_torture_writer");
 	return 0;

From dc86460e77e8cea4896ff19de905001922653311 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <zhouzhouyi@gmail.com>
Date: Wed, 19 Jun 2024 23:06:58 +0000
Subject: [PATCH 14/64] rcutorture: Add CFcommon.arch for arch-specific Kconfig
 options

Add CFcommon.arch for arch-specific Kconfig options.

In accordance with [1], [2] and [3], move the x86-specific kernel option
CONFIG_HYPERVISOR_GUEST to CFcommon.i686 and CFcommon.x86_64, and also
move the x86/PowerPC CONFIG_KVM_GUEST Kconfig option to CFcommon.i686,
CFcommon.x86_64, and CFcommon.ppc64le.

The "arch" in CFcommon.arch is taken from the "uname -m" command.

[1] https://lore.kernel.org/all/20240427005626.1365935-1-zhouzhouyi@gmail.com/
[2] https://lore.kernel.org/all/059d36ce-6453-42be-a31e-895abd35d590@paulmck-laptop/
[3] https://lore.kernel.org/all/ZnBkHosMDhsh4H8g@J2N7QTR9R3/

Tested in x86_64 and PPC VM of Open Source Lab of Oregon State University.

Fixes: a6fda6dab93c ("rcutorture: Tweak kvm options")
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh        | 2 ++
 tools/testing/selftests/rcutorture/configs/rcu/CFcommon         | 2 --
 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686    | 2 ++
 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64  | 2 ++
 5 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index b33cd8753689..ad79784e552d 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG"
 config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG"
 config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG"
 config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG"
+config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \
+		      "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`"
 cp $T/KcList $resdir/ConfigFragment
 
 base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index 0e92d85313aa..217597e84905 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,7 +1,5 @@
 CONFIG_RCU_TORTURE_TEST=y
 CONFIG_PRINTK_TIME=y
-CONFIG_HYPERVISOR_GUEST=y
 CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
 CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n
 CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686
new file mode 100644
index 000000000000..d8b2f555686f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686
@@ -0,0 +1,2 @@
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_KVM_GUEST=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le
new file mode 100644
index 000000000000..133da04247ee
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le
@@ -0,0 +1 @@
+CONFIG_KVM_GUEST=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64
new file mode 100644
index 000000000000..d8b2f555686f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64
@@ -0,0 +1,2 @@
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_KVM_GUEST=y

From 20cee0b3daa315237edded31ddf72a4e948d5e1d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jul 2024 09:18:33 -0700
Subject: [PATCH 15/64] rcutorture: Make rcu_torture_write_types() print number
 of update types

This commit follows the list of update types with their count, resulting
in console output like this:

	rcu_torture_write_types: Testing conditional GPs.
	rcu_torture_write_types: Testing conditional expedited GPs.
	rcu_torture_write_types: Testing conditional full-state GPs.
	rcu_torture_write_types: Testing expedited GPs.
	rcu_torture_write_types: Testing asynchronous GPs.
	rcu_torture_write_types: Testing polling GPs.
	rcu_torture_write_types: Testing polling full-state GPs.
	rcu_torture_write_types: Testing polling expedited GPs.
	rcu_torture_write_types: Testing polling full-state expedited GPs.
	rcu_torture_write_types: Testing normal GPs.
	rcu_torture_write_types: Testing 10 update types

This commit adds the final line giving the count.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcutorture.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index acf9f9945d2b..9a2ecb8b88ec 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1324,6 +1324,7 @@ static void rcu_torture_write_types(void)
 	} else if (gp_sync && !cur_ops->sync) {
 		pr_alert("%s: gp_sync without primitives.\n", __func__);
 	}
+	pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes);
 }
 
 /*

From 9a13a324f40fc3846cf7867daffaccd785beb10c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 19:51:02 -0700
Subject: [PATCH 16/64] tools/rcu: Remove RCU Tasks Rude asynchronous APIs from
 rcu-updaters.sh

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer.
This commit therefore removes them from the rcu-updaters.sh script.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 tools/rcu/rcu-updaters.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh
index 4ef1397927bb..8a5df3f22550 100755
--- a/tools/rcu/rcu-updaters.sh
+++ b/tools/rcu/rcu-updaters.sh
@@ -21,12 +21,10 @@ fi
 bpftrace -e 'kprobe:kvfree_call_rcu,
 	     kprobe:call_rcu,
 	     kprobe:call_rcu_tasks,
-	     kprobe:call_rcu_tasks_rude,
 	     kprobe:call_rcu_tasks_trace,
 	     kprobe:call_srcu,
 	     kprobe:rcu_barrier,
 	     kprobe:rcu_barrier_tasks,
-	     kprobe:rcu_barrier_tasks_rude,
 	     kprobe:rcu_barrier_tasks_trace,
 	     kprobe:srcu_barrier,
 	     kprobe:synchronize_rcu,

From 3471e96bcf53731639587580fd02e5a297940b91 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 18 Jun 2024 10:40:30 -0700
Subject: [PATCH 17/64] rcu/kfree: Warn on unexpected tail state

Within the rcu_sr_normal_gp_cleanup_work() function, there is an acquire
load from rcu_state.srs_done_tail, which is expected to be non-NULL.
This commit adds a WARN_ON_ONCE() to check this expectation.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..0f41a81138dc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1649,7 +1649,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
 	 * the done tail list manipulations are protected here.
 	 */
 	done = smp_load_acquire(&rcu_state.srs_done_tail);
-	if (!done)
+	if (WARN_ON_ONCE(!done))
 		return;
 
 	WARN_ON_ONCE(!rcu_sr_is_wait_head(done));

From c1972c8dc987769eac8e5dede536d3cff489c6ee Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 09:49:49 -0700
Subject: [PATCH 18/64] locking/csd_lock: Print large numbers as negatives

The CSD-lock-hold diagnostics from CONFIG_CSD_LOCK_WAIT_DEBUG are
printed in nanoseconds as unsigned long longs, which is a bit obtuse for
human readers when timing bugs result in negative CSD-lock hold times.
Yes, there are some people to whom it is immediately obvious that
18446744073709551615 is really -1, but for the rest of us...

Therefore, print these numbers as signed long longs, making the negative
hold times immediately apparent.

Reported-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/smp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index aaffecdad319..e87953729230 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -249,8 +249,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
 	/* How long since this CSD lock was stuck. */
 	ts_delta = ts2 - ts0;
-	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
-		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
+	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
+		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
 		 cpu, csd->func, csd->info);
 	/*
 	 * If the CSD lock is still stuck after 5 minutes, it is unlikely

From 9aed3b51fd6186582a95abd9fa67782982540749 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jul 2024 18:49:35 -0700
Subject: [PATCH 19/64] rcu: Better define "atomic" for list replacement

The kernel-doc headers for list_replace_rcu() and hlist_replace_rcu()
claim that the replacement is atomic, which it is, but only for readers.
Avoid confusion by making it clear that the atomic nature of these
functions applies only to readers, not to concurrent updaters.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rculist.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3dc1e58865f7..14dfa6008467 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -191,7 +191,10 @@ static inline void hlist_del_init_rcu(struct hlist_node *n)
  * @old : the element to be replaced
  * @new : the new element to insert
  *
- * The @old entry will be replaced with the @new entry atomically.
+ * The @old entry will be replaced with the @new entry atomically from
+ * the perspective of concurrent readers.  It is the caller's responsibility
+ * to synchronize with concurrent updaters, if any.
+ *
  * Note: @old should not be empty.
  */
 static inline void list_replace_rcu(struct list_head *old,
@@ -519,7 +522,9 @@ static inline void hlist_del_rcu(struct hlist_node *n)
  * @old : the element to be replaced
  * @new : the new element to insert
  *
- * The @old entry will be replaced with the @new entry atomically.
+ * The @old entry will be replaced with the @new entry atomically from
+ * the perspective of concurrent readers.  It is the caller's responsibility
+ * to synchronize with concurrent updaters, if any.
  */
 static inline void hlist_replace_rcu(struct hlist_node *old,
 					struct hlist_node *new)

From cfdbfb94b3824b8da3a6b21252d24754d0162ab6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:37:48 -0700
Subject: [PATCH 20/64] rcutorture: Add rcutree.nohz_full_patience_delay to
 TREE07

This commit adds the rcutree.nohz_full_patience_delay=1000 kernel boot
parameter to the TREE07 scenario, on the observation that "if it ain't
tested, it don't work".

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot
index 979edbf4c820..55ce305b2a3d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot
@@ -2,3 +2,4 @@ nohz_full=2-9
 rcutorture.stall_cpu=14
 rcutorture.stall_cpu_holdoff=90
 rcutorture.fwd_progress=0
+rcutree.nohz_full_patience_delay=1000

From fb579e6656a8d5e042e65062f68ca41321817237 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@toblux.com>
Date: Wed, 7 Aug 2024 11:55:00 +0200
Subject: [PATCH 21/64] rcu: Annotate struct kvfree_rcu_bulk_data with
 __counted_by()

Add the __counted_by compiler attribute to the flexible array member
records to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Increment nr_records before adding a new pointer to the records array.

Signed-off-by: Thorsten Blum <thorsten.blum@toblux.com>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0f41a81138dc..d5bf824159da 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3227,7 +3227,7 @@ struct kvfree_rcu_bulk_data {
 	struct list_head list;
 	struct rcu_gp_oldstate gp_snap;
 	unsigned long nr_records;
-	void *records[];
+	void *records[] __counted_by(nr_records);
 };
 
 /*
@@ -3767,7 +3767,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 	}
 
 	// Finally insert and update the GP for this page.
-	bnode->records[bnode->nr_records++] = ptr;
+	bnode->nr_records++;
+	bnode->records[bnode->nr_records - 1] = ptr;
 	get_state_synchronize_rcu_full(&bnode->gp_snap);
 	atomic_inc(&(*krcp)->bulk_count[idx]);
 

From 0aac9daef6763e6efef398faff71f8c593651cce Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 23 Jul 2024 14:10:25 -0400
Subject: [PATCH 22/64] rcu: Use system_unbound_wq to avoid disturbing isolated
 CPUs

It was discovered that isolated CPUs could sometimes be disturbed by
kworkers processing kfree_rcu() works causing higher than expected
latency. It is because the RCU core uses "system_wq" which doesn't have
the WQ_UNBOUND flag to handle all its work items. Fix this violation of
latency limits by using "system_unbound_wq" in the RCU core instead.
This will ensure that those work items will not be run on CPUs marked
as isolated.

Beside the WQ_UNBOUND flag, the other major difference between system_wq
and system_unbound_wq is their max_active count. The system_unbound_wq
has a max_active of WQ_MAX_ACTIVE (512) while system_wq's max_active
is WQ_DFL_ACTIVE (256) which is half of WQ_MAX_ACTIVE.

Reported-by: Vratislav Bendel <vbendel@redhat.com>
Closes: https://issues.redhat.com/browse/RHEL-50220
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Tested-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..494aa9513d0b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3539,10 +3539,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
 	if (delayed_work_pending(&krcp->monitor_work)) {
 		delay_left = krcp->monitor_work.timer.expires - jiffies;
 		if (delay < delay_left)
-			mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+			mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
 		return;
 	}
-	queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+	queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
 }
 
 static void
@@ -3634,7 +3634,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
 		}
 	}
 
@@ -3704,7 +3704,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
 			!atomic_xchg(&krcp->work_in_progress, 1)) {
 		if (atomic_read(&krcp->backoff_page_cache_fill)) {
-			queue_delayed_work(system_wq,
+			queue_delayed_work(system_unbound_wq,
 				&krcp->page_cache_work,
 					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
 		} else {

From 29bc83e4d90546aa794a9584786086b141a6ba4d Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Mon, 15 Jul 2024 16:23:24 -0700
Subject: [PATCH 23/64] srcu: faster gp seq wrap-around

Using a higher value for the initial gp sequence counters allows for
wrapping to occur faster. It can help with surfacing any issues that may
be happening as a result of the wrap around.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h |  3 +++
 include/linux/srcutree.h | 15 ++++++++++++++-
 kernel/rcu/rcu.h         |  3 ---
 kernel/rcu/srcutree.c    |  7 ++++---
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..8d56db70d417 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -34,6 +34,9 @@
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 
+#define RCU_SEQ_CTR_SHIFT    2
+#define RCU_SEQ_STATE_MASK   ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8f3f72480e78..ed57598394de 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -129,10 +129,23 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
+/*
+ * Values for initializing gp sequence fields. Higher values allow wrap arounds to
+ * occur earlier.
+ * The second value with state is useful in the case of static initialization of
+ * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its
+ * lower bits (or else it will appear to be already initialized within
+ * the call check_init_srcu_struct()).
+ */
+#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT)
+#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1)
+
 #define __SRCU_USAGE_INIT(name)									\
 {												\
 	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
-	.srcu_gp_seq_needed = -1UL,								\
+	.srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL,							\
+	.srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE,				\
+	.srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL,					\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 }
 
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 38238e595a61..2bfed9855d67 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -54,9 +54,6 @@
  *					grace-period sequence number.
  */
 
-#define RCU_SEQ_CTR_SHIFT	2
-#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)
-
 /* Low-order bit definition for polled grace-period APIs. */
 #define RCU_GET_STATE_COMPLETED	0x1
 
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b24db425f16d..6fd9c914ce64 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -247,7 +247,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
-	ssp->srcu_sup->srcu_gp_seq = 0;
+	ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
 	ssp->srcu_sup->srcu_barrier_seq = 0;
 	mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
 	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
@@ -258,7 +258,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	if (!ssp->sda)
 		goto err_free_sup;
 	init_srcu_struct_data(ssp);
-	ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+	ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
@@ -266,7 +266,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 	}
 	ssp->srcu_sup->srcu_ssp = ssp;
-	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
+	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
+			SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */
 	return 0;
 
 err_free_sda:

From c8c3ae83e0bbed6d56a00068c7d10ebf64db48d9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:36:21 -0700
Subject: [PATCH 24/64] srcu: Check for concurrent updates of heuristics

SRCU maintains the ->srcu_n_exp_nodelay and ->reschedule_count values
to guide heuristics governing auto-expediting of normal SRCU grace
periods and grace-period-state-machine delays.  This commit adds KCSAN
ASSERT_EXCLUSIVE_WRITER() calls to check for concurrent updates to
these fields.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/srcutree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6fd9c914ce64..aaee09a6748c 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -629,6 +629,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 		if (time_after(j, gpstart))
 			jbase += j - gpstart;
 		if (!jbase) {
+			ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay);
 			WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
 			if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
 				jbase = 1;
@@ -1819,6 +1820,7 @@ static void process_srcu(struct work_struct *work)
 	} else {
 		j = jiffies;
 		if (READ_ONCE(sup->reschedule_jiffies) == j) {
+			ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count);
 			WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
 			if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
 				curdelay = 1;

From e53cef031bfac2369d249cfd726706c30a5a3351 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:36:22 -0700
Subject: [PATCH 25/64] srcu: Mark callbacks not currently participating in
 barrier operation

SRCU keeps a count of the number of callbacks that the current
srcu_barrier() is waiting on, but there is currently no easy way to
work out which callback is stuck.  One way to do this is to mark idle
SRCU-barrier callbacks by making the ->next pointer point to the callback
itself, and this commit does just that.

Later commits will use this for debug output.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/srcutree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index aaee09a6748c..31706e3293bc 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
 		sdp->srcu_cblist_invoking = false;
 		sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
 		sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+		sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head;
 		sdp->mynode = NULL;
 		sdp->cpu = cpu;
 		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -1562,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 	struct srcu_data *sdp;
 	struct srcu_struct *ssp;
 
+	rhp->next = rhp; // Mark the callback as having been invoked.
 	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
 	ssp = sdp->ssp;
 	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))

From 58cb3210543591c1783a9a884adf27708f19d2a1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:37:49 -0700
Subject: [PATCH 26/64] rcutorture: Add a stall_cpu_repeat module parameter

This commit adds an stall_cpu_repeat kernel, which is also the
rcutorture.stall_cpu_repeat boot parameter, to test repeated CPU stalls.
Note that only the first stall will pay attention to the stall_cpu_irqsoff
module parameter.  For the second and subsequent stalls, interrupts will
be enabled.  This is helpful when testing the interaction between RCU
CPU stall warnings and CSD-lock stall warnings.

Reported-by: Rik van Riel <riel@surriel.com>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  8 ++-
 kernel/rcu/rcutorture.c                       | 56 +++++++++++++------
 2 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1384c7b59c9..db1be767c91a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5384,7 +5384,13 @@
 			Time to wait (s) after boot before inducing stall.
 
 	rcutorture.stall_cpu_irqsoff= [KNL]
-			Disable interrupts while stalling if set.
+			Disable interrupts while stalling if set, but only
+			on the first stall in the set.
+
+	rcutorture.stall_cpu_repeat= [KNL]
+			Number of times to repeat the stall sequence,
+			so that rcutorture.stall_cpu_repeat=3 will result
+			in four stall sequences.
 
 	rcutorture.stall_gp_kthread= [KNL]
 			Duration (s) of forced sleep within RCU
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 9a2ecb8b88ec..4fd23cae0e9e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s
 torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
 torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
 torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
+torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one.");
 torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
 torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -1393,7 +1394,8 @@ rcu_torture_writer(void *arg)
 
 	// If a new stall test is added, this must be adjusted.
 	if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
-		stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
+		stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) *
+			      HZ * (stall_cpu_repeat + 1);
 	VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
 	if (!can_expedite)
 		pr_alert("%s" TORTURE_FLAG
@@ -2391,7 +2393,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 "test_boost=%d/%d test_boost_interval=%d "
 		 "test_boost_duration=%d shutdown_secs=%d "
 		 "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
-		 "stall_cpu_block=%d "
+		 "stall_cpu_block=%d stall_cpu_repeat=%d "
 		 "n_barrier_cbs=%d "
 		 "onoff_interval=%d onoff_holdoff=%d "
 		 "read_exit_delay=%d read_exit_burst=%d "
@@ -2403,7 +2405,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 test_boost, cur_ops->can_boost,
 		 test_boost_interval, test_boost_duration, shutdown_secs,
 		 stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
-		 stall_cpu_block,
+		 stall_cpu_block, stall_cpu_repeat,
 		 n_barrier_cbs,
 		 onoff_interval, onoff_holdoff,
 		 read_exit_delay, read_exit_burst,
@@ -2481,19 +2483,11 @@ static struct notifier_block rcu_torture_stall_block = {
  * induces a CPU stall for the time specified by stall_cpu.  If a new
  * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
  */
-static int rcu_torture_stall(void *args)
+static void rcu_torture_stall_one(int rep, int irqsoff)
 {
 	int idx;
-	int ret;
 	unsigned long stop_at;
 
-	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
-	if (rcu_cpu_stall_notifiers) {
-		ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
-		if (ret)
-			pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
-				__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
-	}
 	if (stall_cpu_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
 		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2513,12 +2507,12 @@ static int rcu_torture_stall(void *args)
 		stop_at = ktime_get_seconds() + stall_cpu;
 		/* RCU CPU stall is expected behavior in following code. */
 		idx = cur_ops->readlock();
-		if (stall_cpu_irqsoff)
+		if (irqsoff)
 			local_irq_disable();
 		else if (!stall_cpu_block)
 			preempt_disable();
-		pr_alert("%s start on CPU %d.\n",
-			  __func__, raw_smp_processor_id());
+		pr_alert("%s start stall episode %d on CPU %d.\n",
+			  __func__, rep + 1, raw_smp_processor_id());
 		while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
 		       !kthread_should_stop())
 			if (stall_cpu_block) {
@@ -2530,12 +2524,42 @@ static int rcu_torture_stall(void *args)
 			} else if (stall_no_softlockup) {
 				touch_softlockup_watchdog();
 			}
-		if (stall_cpu_irqsoff)
+		if (irqsoff)
 			local_irq_enable();
 		else if (!stall_cpu_block)
 			preempt_enable();
 		cur_ops->readunlock(idx);
 	}
+}
+
+/*
+ * CPU-stall kthread.  Invokes rcu_torture_stall_one() once, and then as many
+ * additional times as specified by the stall_cpu_repeat module parameter.
+ * Note that stall_cpu_irqsoff is ignored on the second and subsequent
+ * stall.
+ */
+static int rcu_torture_stall(void *args)
+{
+	int i;
+	int repeat = stall_cpu_repeat;
+	int ret;
+
+	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+	if (repeat < 0) {
+		repeat = 0;
+		WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+	}
+	if (rcu_cpu_stall_notifiers) {
+		ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+		if (ret)
+			pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+				__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+	}
+	for (i = 0; i <= repeat; i++) {
+		if (kthread_should_stop())
+			break;
+		rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0);
+	}
 	pr_alert("%s end.\n", __func__);
 	if (rcu_cpu_stall_notifiers && !ret) {
 		ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);

From 1c5144a066fd17eba681b3d6b0f86bda8e06fbfa Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:37:50 -0700
Subject: [PATCH 27/64] torture: Add torture.sh --guest-cpu-limit argument for
 limited hosts

Some servers have limitations on the number of CPUs a given guest OS
can use.  In my earlier experience, such limitations have been at least
half of the host's CPUs, but in a recent example, this limit is less
than 40%.  This commit therefore adds a --guest-cpu-limit argument that
allows such low limits to be made known to torture.sh.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 .../selftests/rcutorture/bin/torture.sh       | 38 +++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index 990d24696fd3..0447c4a00cc4 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH
 
 TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`"
 MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2))
-HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2))
-if test "$HALF_ALLOTED_CPUS" -lt 1
+SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2))
+if test "$SCALE_ALLOTED_CPUS" -lt 1
 then
-	HALF_ALLOTED_CPUS=1
+	SCALE_ALLOTED_CPUS=1
 fi
 VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16))
 if test "$VERBOSE_BATCH_CPUS" -lt 2
@@ -90,6 +90,7 @@ usage () {
 	echo "       --do-scftorture / --do-no-scftorture / --no-scftorture"
 	echo "       --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep"
 	echo "       --duration [ <minutes> | <hours>h | <days>d ]"
+	echo "       --guest-cpu-limit N"
 	echo "       --kcsan-kmake-arg kernel-make-arguments"
 	exit 1
 }
@@ -203,6 +204,21 @@ do
 		duration_base=$(($ts*mult))
 		shift
 		;;
+	--guest-cpu-limit|--guest-cpu-lim)
+		checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--'
+		if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2))
+		then
+			SCALE_ALLOTED_CPUS="$2"
+			VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))"
+			if (("$VERBOSE_BATCH_CPUS" < 2))
+			then
+				VERBOSE_BATCH_CPUS=0
+			fi
+		else
+			echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))."
+		fi
+		shift
+		;;
 	--kcsan-kmake-arg|--kcsan-kmake-args)
 		checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
 		kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`"
@@ -425,9 +441,9 @@ fi
 if test "$do_scftorture" = "yes"
 then
 	# Scale memory based on the number of CPUs.
-	scfmem=$((3+HALF_ALLOTED_CPUS/16))
-	torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1"
-	torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make
+	scfmem=$((3+SCALE_ALLOTED_CPUS/16))
+	torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1"
+	torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make
 fi
 
 if test "$do_rt" = "yes"
@@ -471,8 +487,8 @@ for prim in $primlist
 do
 	if test -n "$firsttime"
 	then
-		torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot"
-		torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make
+		torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot"
+		torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make
 		mv $T/last-resdir-nodebug $T/first-resdir-nodebug || :
 		if test -f "$T/last-resdir-kasan"
 		then
@@ -520,8 +536,8 @@ for prim in $primlist
 do
 	if test -n "$firsttime"
 	then
-		torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot"
-		torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make
+		torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot"
+		torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make
 		mv $T/last-resdir-nodebug $T/first-resdir-nodebug || :
 		if test -f "$T/last-resdir-kasan"
 		then
@@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save"
 if test "$do_kvfree" = "yes"
 then
 	torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot"
-	torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make
+	torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make
 fi
 
 if test "$do_clocksourcewd" = "yes"

From 0ff92d145a2be4da47e2ee6287a731084fba112f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 19:52:29 -0700
Subject: [PATCH 28/64] doc: Remove RCU Tasks Rude asynchronous APIs

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer.
This commit therefore removes them from the documentation.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 .../RCU/Design/Requirements/Requirements.rst  |  3 +-
 Documentation/RCU/checklist.rst               | 61 +++++++++----------
 Documentation/RCU/whatisRCU.rst               |  2 +-
 .../admin-guide/kernel-parameters.txt         |  8 ---
 4 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index f511476b4550..6125e7068d2c 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -2649,8 +2649,7 @@ those that are idle from RCU's perspective) and then Tasks Rude RCU can
 be removed from the kernel.
 
 The tasks-rude-RCU API is also reader-marking-free and thus quite compact,
-consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(),
-and rcu_barrier_tasks_rude().
+consisting solely of synchronize_rcu_tasks_rude().
 
 Tasks Trace RCU
 ~~~~~~~~~~~~~~~
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 3e6407de231c..7de3e308f330 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -194,14 +194,13 @@ over a rather long period of time, but improvements are always welcome!
 		when publicizing a pointer to a structure that can
 		be traversed by an RCU read-side critical section.
 
-5.	If any of call_rcu(), call_srcu(), call_rcu_tasks(),
-	call_rcu_tasks_rude(), or call_rcu_tasks_trace() is used,
-	the callback function may be invoked from softirq context,
-	and in any case with bottom halves disabled.  In particular,
-	this callback function cannot block.  If you need the callback
-	to block, run that code in a workqueue handler scheduled from
-	the callback.  The queue_rcu_work() function does this for you
-	in the case of call_rcu().
+5.	If any of call_rcu(), call_srcu(), call_rcu_tasks(), or
+	call_rcu_tasks_trace() is used, the callback function may be
+	invoked from softirq context, and in any case with bottom halves
+	disabled.  In particular, this callback function cannot block.
+	If you need the callback to block, run that code in a workqueue
+	handler scheduled from the callback.  The queue_rcu_work()
+	function does this for you in the case of call_rcu().
 
 6.	Since synchronize_rcu() can block, it cannot be called
 	from any sort of irq context.  The same rule applies
@@ -254,10 +253,10 @@ over a rather long period of time, but improvements are always welcome!
 		corresponding readers must use rcu_read_lock_trace()
 		and rcu_read_unlock_trace().
 
-	c.	If an updater uses call_rcu_tasks_rude() or
-		synchronize_rcu_tasks_rude(), then the corresponding
-		readers must use anything that disables preemption,
-		for example, preempt_disable() and preempt_enable().
+	c.	If an updater uses synchronize_rcu_tasks_rude(),
+		then the corresponding readers must use anything that
+		disables preemption, for example, preempt_disable()
+		and preempt_enable().
 
 	Mixing things up will result in confusion and broken kernels, and
 	has even resulted in an exploitable security issue.  Therefore,
@@ -326,11 +325,9 @@ over a rather long period of time, but improvements are always welcome!
 	d.	Periodically invoke rcu_barrier(), permitting a limited
 		number of updates per grace period.
 
-	The same cautions apply to call_srcu(), call_rcu_tasks(),
-	call_rcu_tasks_rude(), and call_rcu_tasks_trace().  This is
-	why there is an srcu_barrier(), rcu_barrier_tasks(),
-	rcu_barrier_tasks_rude(), and rcu_barrier_tasks_rude(),
-	respectively.
+	The same cautions apply to call_srcu(), call_rcu_tasks(), and
+	call_rcu_tasks_trace().  This is why there is an srcu_barrier(),
+	rcu_barrier_tasks(), and rcu_barrier_tasks_trace(), respectively.
 
 	Note that although these primitives do take action to avoid
 	memory exhaustion when any given CPU has too many callbacks,
@@ -383,17 +380,17 @@ over a rather long period of time, but improvements are always welcome!
 	must use whatever locking or other synchronization is required
 	to safely access and/or modify that data structure.
 
-	Do not assume that RCU callbacks will be executed on
-	the same CPU that executed the corresponding call_rcu(),
-	call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or
-	call_rcu_tasks_trace().  For example, if a given CPU goes offline
-	while having an RCU callback pending, then that RCU callback
-	will execute on some surviving CPU.  (If this was not the case,
-	a self-spawning RCU callback would prevent the victim CPU from
-	ever going offline.)  Furthermore, CPUs designated by rcu_nocbs=
-	might well *always* have their RCU callbacks executed on some
-	other CPUs, in fact, for some  real-time workloads, this is the
-	whole point of using the rcu_nocbs= kernel boot parameter.
+	Do not assume that RCU callbacks will be executed on the same
+	CPU that executed the corresponding call_rcu(), call_srcu(),
+	call_rcu_tasks(), or call_rcu_tasks_trace().  For example, if
+	a given CPU goes offline while having an RCU callback pending,
+	then that RCU callback will execute on some surviving CPU.
+	(If this was not the case, a self-spawning RCU callback would
+	prevent the victim CPU from ever going offline.)  Furthermore,
+	CPUs designated by rcu_nocbs= might well *always* have their
+	RCU callbacks executed on some other CPUs, in fact, for some
+	real-time workloads, this is the whole point of using the
+	rcu_nocbs= kernel boot parameter.
 
 	In addition, do not assume that callbacks queued in a given order
 	will be invoked in that order, even if they all are queued on the
@@ -507,9 +504,9 @@ over a rather long period of time, but improvements are always welcome!
 	These debugging aids can help you find problems that are
 	otherwise extremely difficult to spot.
 
-17.	If you pass a callback function defined within a module to one of
-	call_rcu(), call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(),
-	or call_rcu_tasks_trace(), then it is necessary to wait for all
+17.	If you pass a callback function defined within a module
+	to one of call_rcu(), call_srcu(), call_rcu_tasks(), or
+	call_rcu_tasks_trace(), then it is necessary to wait for all
 	pending callbacks to be invoked before unloading that module.
 	Note that it is absolutely *not* sufficient to wait for a grace
 	period!  For example, synchronize_rcu() implementation is *not*
@@ -522,7 +519,6 @@ over a rather long period of time, but improvements are always welcome!
 	-	call_rcu() -> rcu_barrier()
 	-	call_srcu() -> srcu_barrier()
 	-	call_rcu_tasks() -> rcu_barrier_tasks()
-	-	call_rcu_tasks_rude() -> rcu_barrier_tasks_rude()
 	-	call_rcu_tasks_trace() -> rcu_barrier_tasks_trace()
 
 	However, these barrier functions are absolutely *not* guaranteed
@@ -539,7 +535,6 @@ over a rather long period of time, but improvements are always welcome!
 	-	Either synchronize_srcu() or synchronize_srcu_expedited(),
 		together with and srcu_barrier()
 	-	synchronize_rcu_tasks() and rcu_barrier_tasks()
-	-	synchronize_tasks_rude() and rcu_barrier_tasks_rude()
 	-	synchronize_tasks_trace() and rcu_barrier_tasks_trace()
 
 	If necessary, you can use something like workqueues to execute
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index d585a5490aee..1ef5784c1b84 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -1103,7 +1103,7 @@ RCU-Tasks-Rude::
 
 	Critical sections	Grace period		Barrier
 
-	N/A			call_rcu_tasks_rude	rcu_barrier_tasks_rude
+	N/A						N/A
 				synchronize_rcu_tasks_rude
 
 
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1384c7b59c9..a6107b745076 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5572,14 +5572,6 @@
 			of zero will disable batching.	Batching is
 			always disabled for synchronize_rcu_tasks().
 
-	rcupdate.rcu_tasks_rude_lazy_ms= [KNL]
-			Set timeout in milliseconds RCU Tasks
-			Rude asynchronous callback batching for
-			call_rcu_tasks_rude().	A negative value
-			will take the default.	A value of zero will
-			disable batching.  Batching is always disabled
-			for synchronize_rcu_tasks_rude().
-
 	rcupdate.rcu_tasks_trace_lazy_ms= [KNL]
 			Set timeout in milliseconds RCU Tasks
 			Trace asynchronous callback batching for

From b428a9de9bba8877aeb5de4691cc9334467065d6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 09:04:08 -0700
Subject: [PATCH 29/64] rcutorture: Stop testing RCU Tasks Rude asynchronous
 APIs

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently
unused.  Furthermore, the idea is to get rid of RCU Tasks Rude entirely
once all architectures have their deep-idle and entry/exit code correctly
marked as inline or noinstr.  As a first step towards this goal, this
commit therefore removes these two functions from rcutorture testing.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcutorture.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 08bf7c669dd3..cd8752b2a960 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -915,11 +915,6 @@ static struct rcu_torture_ops tasks_ops = {
  * Definitions for rude RCU-tasks torture testing.
  */
 
-static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p)
-{
-	call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb);
-}
-
 static struct rcu_torture_ops tasks_rude_ops = {
 	.ttype		= RCU_TASKS_RUDE_FLAVOR,
 	.init		= rcu_sync_torture_init,
@@ -927,11 +922,8 @@ static struct rcu_torture_ops tasks_rude_ops = {
 	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock	= rcu_torture_read_unlock_trivial,
 	.get_gp_seq	= rcu_no_completed,
-	.deferred_free	= rcu_tasks_rude_torture_deferred_free,
 	.sync		= synchronize_rcu_tasks_rude,
 	.exp_sync	= synchronize_rcu_tasks_rude,
-	.call		= call_rcu_tasks_rude,
-	.cb_barrier	= rcu_barrier_tasks_rude,
 	.gp_kthread_dbg	= show_rcu_tasks_rude_gp_kthread,
 	.get_gp_data	= rcu_tasks_rude_get_gp_data,
 	.cbflood_max	= 50000,

From 599194d01459670049079ff54d52442f54d38432 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 19:46:00 -0700
Subject: [PATCH 30/64] rcuscale: Stop testing RCU Tasks Rude asynchronous APIs

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently
unused.  Furthermore, the idea is to get rid of RCU Tasks Rude entirely
once all architectures have their deep-idle and entry/exit code correctly
marked as inline or noinstr.  As a step towards this goal, this commit
therefore removes these two functions from rcuscale testing.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index b53a9e8f5904..d534d4ec2314 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -333,8 +333,6 @@ static struct rcu_scale_ops tasks_rude_ops = {
 	.readunlock	= tasks_rude_scale_read_unlock,
 	.get_gp_seq	= rcu_no_completed,
 	.gp_diff	= rcu_seq_diff,
-	.async		= call_rcu_tasks_rude,
-	.gp_barrier	= rcu_barrier_tasks_rude,
 	.sync		= synchronize_rcu_tasks_rude,
 	.exp_sync	= synchronize_rcu_tasks_rude,
 	.rso_gp_kthread	= get_rcu_tasks_rude_gp_kthread,

From 7945b741d1fc071a621366c512a060ea08848955 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 19:47:47 -0700
Subject: [PATCH 31/64] rcu-tasks: Remove RCU Tasks Rude asynchronous APIs

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently
unused.  This commit therefore removes their definitions and boot-time
self-tests.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h |  2 --
 kernel/rcu/tasks.h       | 55 +++++++++++++---------------------------
 2 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..31e679c7110e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -37,7 +37,6 @@
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
-void rcu_barrier_tasks_rude(void);
 void synchronize_rcu(void);
 
 struct rcu_gp_oldstate;
@@ -202,7 +201,6 @@ do {									\
 } while (0)
 
 # ifdef CONFIG_TASKS_RUDE_RCU
-void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks_rude(void);
 # endif
 
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ba3440a45b6d..4bc038bcc016 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -396,7 +396,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
 
 // Wait for all in-flight callbacks for the specified RCU Tasks flavor.
 // Operates in a manner similar to rcu_barrier().
-static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
 {
 	int cpu;
 	unsigned long flags;
@@ -1244,13 +1244,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
 
 ////////////////////////////////////////////////////////////////////////
 //
-// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
-// passing an empty function to schedule_on_each_cpu().  This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching of
-// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
-// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
-// and induces otherwise unnecessary context switches on all online CPUs,
-// whether idle or not.
+// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's
+// trick of passing an empty function to schedule_on_each_cpu().
+// This approach provides batching of concurrent calls to the synchronous
+// synchronize_rcu_tasks_rude() API.  This invokes schedule_on_each_cpu()
+// in order to send IPIs far and wide and induces otherwise unnecessary
+// context switches on all online CPUs, whether idle or not.
 //
 // Callback handling is provided by the rcu_tasks_kthread() function.
 //
@@ -1268,11 +1267,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
 	schedule_on_each_cpu(rcu_tasks_be_rude);
 }
 
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
 DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
 		 "RCU Tasks Rude");
 
-/**
+/*
  * call_rcu_tasks_rude() - Queue a callback rude task-based grace period
  * @rhp: structure to be used for queueing the RCU updates.
  * @func: actual callback function to be invoked after the grace period
@@ -1289,12 +1288,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
+ *
+ * This is no longer exported, and is instead reserved for use by
+ * synchronize_rcu_tasks_rude().
  */
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
 {
 	call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
 }
-EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
 
 /**
  * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
@@ -1320,26 +1321,9 @@ void synchronize_rcu_tasks_rude(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
 
-/**
- * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks.
- *
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
- */
-void rcu_barrier_tasks_rude(void)
-{
-	rcu_barrier_tasks_generic(&rcu_tasks_rude);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
-
-int rcu_tasks_rude_lazy_ms = -1;
-module_param(rcu_tasks_rude_lazy_ms, int, 0444);
-
 static int __init rcu_spawn_tasks_rude_kthread(void)
 {
 	rcu_tasks_rude.gp_sleep = HZ / 10;
-	if (rcu_tasks_rude_lazy_ms >= 0)
-		rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
 	rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
 	return 0;
 }
@@ -2069,11 +2053,6 @@ static struct rcu_tasks_test_desc tests[] = {
 		/* If not defined, the test is skipped. */
 		.notrun = IS_ENABLED(CONFIG_TASKS_RCU),
 	},
-	{
-		.name = "call_rcu_tasks_rude()",
-		/* If not defined, the test is skipped. */
-		.notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
-	},
 	{
 		.name = "call_rcu_tasks_trace()",
 		/* If not defined, the test is skipped. */
@@ -2081,6 +2060,7 @@ static struct rcu_tasks_test_desc tests[] = {
 	}
 };
 
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
 static void test_rcu_tasks_callback(struct rcu_head *rhp)
 {
 	struct rcu_tasks_test_desc *rttd =
@@ -2090,6 +2070,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
 
 	rttd->notrun = false;
 }
+#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
 
 static void rcu_tasks_initiate_self_tests(void)
 {
@@ -2102,16 +2083,14 @@ static void rcu_tasks_initiate_self_tests(void)
 
 #ifdef CONFIG_TASKS_RUDE_RCU
 	pr_info("Running RCU Tasks Rude wait API self tests\n");
-	tests[1].runstart = jiffies;
 	synchronize_rcu_tasks_rude();
-	call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 	pr_info("Running RCU Tasks Trace wait API self tests\n");
-	tests[2].runstart = jiffies;
+	tests[1].runstart = jiffies;
 	synchronize_rcu_tasks_trace();
-	call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+	call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback);
 #endif
 }
 

From fd70e9f1d85f5323096ad313ba73f5fe3d15ea41 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 10 Jul 2024 12:45:42 +0800
Subject: [PATCH 32/64] rcu-tasks: Fix access non-existent percpu rtpcp
 variable in rcu_tasks_need_gpcb()

For kernels built with CONFIG_FORCE_NR_CPUS=y, the nr_cpu_ids is
defined as NR_CPUS instead of the number of possible cpus, this
will cause the following system panic:

smpboot: Allowing 4 CPUs, 0 hotplug CPUs
...
setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:512 nr_node_ids:1
...
BUG: unable to handle page fault for address: ffffffff9911c8c8
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 15 Comm: rcu_tasks_trace Tainted: G W
6.6.21 #1 5dc7acf91a5e8e9ac9dcfc35bee0245691283ea6
RIP: 0010:rcu_tasks_need_gpcb+0x25d/0x2c0
RSP: 0018:ffffa371c00a3e60 EFLAGS: 00010082
CR2: ffffffff9911c8c8 CR3: 000000040fa20005 CR4: 00000000001706f0
Call Trace:
<TASK>
? __die+0x23/0x80
? page_fault_oops+0xa4/0x180
? exc_page_fault+0x152/0x180
? asm_exc_page_fault+0x26/0x40
? rcu_tasks_need_gpcb+0x25d/0x2c0
? __pfx_rcu_tasks_kthread+0x40/0x40
rcu_tasks_one_gp+0x69/0x180
rcu_tasks_kthread+0x94/0xc0
kthread+0xe8/0x140
? __pfx_kthread+0x40/0x40
ret_from_fork+0x34/0x80
? __pfx_kthread+0x40/0x40
ret_from_fork_asm+0x1b/0x80
</TASK>

Considering that there may be holes in the CPU numbers, use the
maximum possible cpu number, instead of nr_cpu_ids, for configuring
enqueue and dequeue limits.

[ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ]

Closes: https://lore.kernel.org/linux-input/CALMA0xaTSMN+p4xUXkzrtR5r6k7hgoswcaXx7baR_z9r5jjskw@mail.gmail.com/T/#u
Reported-by: Zhixu Liu <zhixu.liu@gmail.com>
Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tasks.h | 82 ++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 29 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 4bc038bcc016..72d564c84499 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
  * @rtp_blkd_tasks: List of tasks blocked as readers.
  * @rtp_exit_list: List of tasks in the latter portion of do_exit().
  * @cpu: CPU number corresponding to this entry.
+ * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure.
  * @rtpp: Pointer to the rcu_tasks structure.
  */
 struct rcu_tasks_percpu {
@@ -49,6 +50,7 @@ struct rcu_tasks_percpu {
 	struct list_head rtp_blkd_tasks;
 	struct list_head rtp_exit_list;
 	int cpu;
+	int index;
 	struct rcu_tasks *rtpp;
 };
 
@@ -76,6 +78,7 @@ struct rcu_tasks_percpu {
  * @call_func: This flavor's call_rcu()-equivalent function.
  * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
  * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask.
  * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
  * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
  * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
@@ -110,6 +113,7 @@ struct rcu_tasks {
 	call_rcu_func_t call_func;
 	unsigned int wait_state;
 	struct rcu_tasks_percpu __percpu *rtpcpu;
+	struct rcu_tasks_percpu **rtpcp_array;
 	int percpu_enqueue_shift;
 	int percpu_enqueue_lim;
 	int percpu_dequeue_lim;
@@ -182,6 +186,8 @@ module_param(rcu_task_collapse_lim, int, 0444);
 static int rcu_task_lazy_lim __read_mostly = 32;
 module_param(rcu_task_lazy_lim, int, 0444);
 
+static int rcu_task_cpu_ids;
+
 /* RCU tasks grace-period state for debugging. */
 #define RTGS_INIT		 0
 #define RTGS_WAIT_WAIT_CBS	 1
@@ -245,6 +251,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 	int cpu;
 	int lim;
 	int shift;
+	int maxcpu;
+	int index = 0;
 
 	if (rcu_task_enqueue_lim < 0) {
 		rcu_task_enqueue_lim = 1;
@@ -254,14 +262,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 	}
 	lim = rcu_task_enqueue_lim;
 
-	if (lim > nr_cpu_ids)
-		lim = nr_cpu_ids;
-	shift = ilog2(nr_cpu_ids / lim);
-	if (((nr_cpu_ids - 1) >> shift) >= lim)
-		shift++;
-	WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
-	WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
-	smp_store_release(&rtp->percpu_enqueue_lim, lim);
+	rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL);
+	BUG_ON(!rtp->rtpcp_array);
+
 	for_each_possible_cpu(cpu) {
 		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
 
@@ -273,14 +276,29 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 		INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
 		rtpcp->cpu = cpu;
 		rtpcp->rtpp = rtp;
+		rtpcp->index = index;
+		rtp->rtpcp_array[index] = rtpcp;
+		index++;
 		if (!rtpcp->rtp_blkd_tasks.next)
 			INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
 		if (!rtpcp->rtp_exit_list.next)
 			INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+		maxcpu = cpu;
 	}
 
-	pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
-			data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+	rcu_task_cpu_ids = maxcpu + 1;
+	if (lim > rcu_task_cpu_ids)
+		lim = rcu_task_cpu_ids;
+	shift = ilog2(rcu_task_cpu_ids / lim);
+	if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
+		shift++;
+	WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+	WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+	smp_store_release(&rtp->percpu_enqueue_lim, lim);
+
+	pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
+			rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
+			rcu_task_cb_adjust, rcu_task_cpu_ids);
 }
 
 // Compute wakeup time for lazy callback timer.
@@ -348,7 +366,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 			rtpcp->rtp_n_lock_retries = 0;
 		}
 		if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
-		    READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+		    READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids)
 			needadjust = true;  // Defer adjustment to avoid deadlock.
 	}
 	// Queuing callbacks before initialization not yet supported.
@@ -368,10 +386,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	if (unlikely(needadjust)) {
 		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
-		if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+		if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) {
 			WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
-			WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
-			smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+			WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids);
+			smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids);
 			pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
 		}
 		raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@@ -444,6 +462,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 
 	dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
 	for (cpu = 0; cpu < dequeue_limit; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
 		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
 
 		/* Advance and accelerate any new callbacks. */
@@ -481,7 +501,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 	if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
 		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
 		if (rtp->percpu_enqueue_lim > 1) {
-			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids));
 			smp_store_release(&rtp->percpu_enqueue_lim, 1);
 			rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
 			gpdone = false;
@@ -496,7 +516,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 			pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
 		}
 		if (rtp->percpu_dequeue_lim == 1) {
-			for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+			for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) {
+				if (!cpu_possible(cpu))
+					continue;
 				struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
 
 				WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
@@ -511,30 +533,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 // Advance callbacks and invoke any that are ready.
 static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
 {
-	int cpu;
-	int cpunext;
 	int cpuwq;
 	unsigned long flags;
 	int len;
+	int index;
 	struct rcu_head *rhp;
 	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
 	struct rcu_tasks_percpu *rtpcp_next;
 
-	cpu = rtpcp->cpu;
-	cpunext = cpu * 2 + 1;
-	if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
-		rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
-		cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
-		queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
-		cpunext++;
-		if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
-			rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
-			cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+	index = rtpcp->index * 2 + 1;
+	if (index < num_possible_cpus()) {
+		rtpcp_next = rtp->rtpcp_array[index];
+		if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+			cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
 			queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+			index++;
+			if (index < num_possible_cpus()) {
+				rtpcp_next = rtp->rtpcp_array[index];
+				if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+					cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
+					queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+				}
+			}
 		}
 	}
 
-	if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+	if (rcu_segcblist_empty(&rtpcp->cblist))
 		return;
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 	rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));

From 49f49266ec9d61a4d038cb4ddff4417ba793198f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:21 -0700
Subject: [PATCH 33/64] rcu/tasks: Check processor-ID assumptions

The current mapping of smp_processor_id() to a CPU processing Tasks-RCU
callbacks makes some assumptions about layout.  This commit therefore
adds a WARN_ON() to check these assumptions.

[ neeraj.upadhyay: Replace nr_cpu_ids with rcu_task_cpu_ids. ]

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tasks.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 72d564c84499..9fb8bb0afff7 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -357,6 +357,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 	rcu_read_lock();
 	ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
 	chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+	WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids);
 	rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
 	if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
 		raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.

From 522295774db51ab39ea3d46926555401c8f10130 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:22 -0700
Subject: [PATCH 34/64] rcu/tasks: Update rtp->tasks_gp_seq comment

The rtp->tasks_gp_seq grace-period sequence number is not a strict count,
but rather the usual RCU sequence number with the lower few bits tracking
per-grace-period state and the upper bits the count of grace periods
since boot, give or take the initial value.  This commit therefore
adjusts this comment.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 9fb8bb0afff7..127018280618 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -65,7 +65,7 @@ struct rcu_tasks_percpu {
  * @init_fract: Initial backoff sleep interval.
  * @gp_jiffies: Time of last @gp_state transition.
  * @gp_start: Most recent grace-period start in jiffies.
- * @tasks_gp_seq: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot in upper bits.
  * @n_ipis: Number of IPIs sent to encourage grace periods to end.
  * @n_ipis_fails: Number of IPI-send failures.
  * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.

From 54973cdd166b13195a99940cd21a827c3a99cac2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:23 -0700
Subject: [PATCH 35/64] rcu: Provide rcu_barrier_cb_is_done() to check
 rcu_barrier() CBs

This commit provides a rcu_barrier_cb_is_done() function that returns
true if the *rcu_barrier*() callback passed in is done.  This will be
used when printing grace-period debugging information.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcu.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 38238e595a61..caaed27e476b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -255,6 +255,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp)
 		kmem_dump_obj(rhp);
 }
 
+static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp)
+{
+	return rhp->next == rhp;
+}
+
 extern int rcu_cpu_stall_suppress_at_boot;
 
 static inline bool rcu_stall_is_suppressed_at_boot(void)

From d3f84aeb71d61ece24164bb2ce3161c60922fd8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:24 -0700
Subject: [PATCH 36/64] rcu/tasks: Mark callbacks not currently participating
 in barrier operation

Each Tasks RCU flavor keeps a count of the number of callbacks that the
current rcu_barrier_tasks*() is waiting on, but there is currently no
easy way to work out which callback is stuck.  One way to do this is to
mark idle RCU-barrier callbacks by making the ->next pointer point to
the callback itself, and this commit does just that.

Later commits will use this for debug output.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tasks.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 127018280618..d44abcd656d6 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -283,6 +283,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 			INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
 		if (!rtpcp->rtp_exit_list.next)
 			INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+		rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head;
 		maxcpu = cpu;
 	}
 
@@ -407,6 +408,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
 	struct rcu_tasks *rtp;
 	struct rcu_tasks_percpu *rtpcp;
 
+	rhp->next = rhp; // Mark the callback as having been invoked.
 	rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
 	rtp = rtpcp->rtpp;
 	if (atomic_dec_and_test(&rtp->barrier_q_count))

From fe91cf39db0939ac8d523b5a1c31840f7cbe205c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:25 -0700
Subject: [PATCH 37/64] rcu/tasks: Add detailed grace-period and barrier
 diagnostics

This commit adds rcu_tasks_torture_stats_print(),
rcu_tasks_trace_torture_stats_print(), and
rcu_tasks_rude_torture_stats_print() functions that provide detailed
diagnostics on grace-period, callback, and barrier state.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h |  3 ++
 kernel/rcu/tasks.h       | 66 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 31e679c7110e..17463e95b6ef 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -164,6 +164,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
 	} while (0)
 void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks(void);
+void rcu_tasks_torture_stats_print(char *tt, char *tf);
 # else
 # define rcu_tasks_classic_qs(t, preempt) do { } while (0)
 # define call_rcu_tasks call_rcu
@@ -190,6 +191,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t);
 			rcu_tasks_trace_qs_blkd(t);				\
 		}								\
 	} while (0)
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf);
 # else
 # define rcu_tasks_trace_qs(t) do { } while (0)
 # endif
@@ -202,6 +204,7 @@ do {									\
 
 # ifdef CONFIG_TASKS_RUDE_RCU
 void synchronize_rcu_tasks_rude(void);
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf);
 # endif
 
 #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d44abcd656d6..5f6d80ce1e47 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -714,9 +714,7 @@ static void __init rcu_tasks_bootup_oddness(void)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
-#endif /* #ifndef CONFIG_TINY_RCU */
 
-#ifndef CONFIG_TINY_RCU
 /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
 static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 {
@@ -750,6 +748,52 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 		rtp->lazy_jiffies,
 		s);
 }
+
+/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */
+static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt,
+						  char *tf, char *tst)
+{
+	cpumask_var_t cm;
+	int cpu;
+	bool gotcb = false;
+	unsigned long j = jiffies;
+
+	pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n",
+		 tt, tf, tst, data_race(rtp->tasks_gp_seq),
+		 j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies),
+		 data_race(rtp->gp_state), tasks_gp_state_getname(rtp));
+	pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n",
+		 data_race(rtp->percpu_enqueue_shift),
+		 data_race(rtp->percpu_enqueue_lim),
+		 data_race(rtp->percpu_dequeue_lim),
+		 data_race(rtp->percpu_dequeue_gpseq));
+	(void)zalloc_cpumask_var(&cm, GFP_KERNEL);
+	pr_alert("\tCallback counts:");
+	for_each_possible_cpu(cpu) {
+		long n;
+		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+		if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head))
+			cpumask_set_cpu(cpu, cm);
+		n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+		if (!n)
+			continue;
+		pr_cont(" %d:%ld", cpu, n);
+		gotcb = true;
+	}
+	if (gotcb)
+		pr_cont(".\n");
+	else
+		pr_cont(" (none).\n");
+	pr_alert("\tBarrier seq %lu count %d holdout CPUs ",
+		 data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count));
+	if (cpumask_available(cm) && !cpumask_empty(cm))
+		pr_cont(" %*pbl.\n", cpumask_pr_args(cm));
+	else
+		pr_cont("(none).\n");
+	free_cpumask_var(cm);
+}
+
 #endif // #ifndef CONFIG_TINY_RCU
 
 static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -1201,6 +1245,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
 }
 EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+
+void rcu_tasks_torture_stats_print(char *tt, char *tf)
+{
+	rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print);
 #endif // !defined(CONFIG_TINY_RCU)
 
 struct task_struct *get_rcu_tasks_gp_kthread(void)
@@ -1361,6 +1411,12 @@ void show_rcu_tasks_rude_gp_kthread(void)
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
 }
 EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf)
+{
+	rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print);
 #endif // !defined(CONFIG_TINY_RCU)
 
 struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
@@ -2038,6 +2094,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
 }
 EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
+{
+	rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
 #endif // !defined(CONFIG_TINY_RCU)
 
 struct task_struct *get_rcu_tasks_trace_gp_kthread(void)

From 591ce640819ff10385cd072fb19c2ca5d5181fe5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:26 -0700
Subject: [PATCH 38/64] rcu/tasks: Add rcu_barrier_tasks*() start time to
 diagnostics

This commit adds the start time, in jiffies, of the most recently started
rcu_barrier_tasks*() operation to the diagnostic output used by rcuscale.
This information can be helpful in distinguishing a hung barrier operation
from a long series of barrier operations.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tasks.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 5f6d80ce1e47..36be92efb0da 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -87,6 +87,7 @@ struct rcu_tasks_percpu {
  * @barrier_q_count: Number of queues being waited on.
  * @barrier_q_completion: Barrier wait/wakeup mechanism.
  * @barrier_q_seq: Sequence number for barrier operations.
+ * @barrier_q_start: Most recent barrier start in jiffies.
  * @name: This flavor's textual name.
  * @kname: This flavor's kthread name.
  */
@@ -122,6 +123,7 @@ struct rcu_tasks {
 	atomic_t barrier_q_count;
 	struct completion barrier_q_completion;
 	unsigned long barrier_q_seq;
+	unsigned long barrier_q_start;
 	char *name;
 	char *kname;
 };
@@ -430,6 +432,7 @@ static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
 		mutex_unlock(&rtp->barrier_q_mutex);
 		return;
 	}
+	rtp->barrier_q_start = jiffies;
 	rcu_seq_start(&rtp->barrier_q_seq);
 	init_completion(&rtp->barrier_q_completion);
 	atomic_set(&rtp->barrier_q_count, 2);
@@ -785,8 +788,9 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t
 		pr_cont(".\n");
 	else
 		pr_cont(" (none).\n");
-	pr_alert("\tBarrier seq %lu count %d holdout CPUs ",
-		 data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count));
+	pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ",
+		 data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start),
+		 atomic_read(&rtp->barrier_q_count));
 	if (cpumask_available(cm) && !cpumask_empty(cm))
 		pr_cont(" %*pbl.\n", cpumask_pr_args(cm));
 	else

From 3e49aea71d5bac93b892f954d14bc8070e4cc651 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 5 Jul 2024 19:57:47 -0700
Subject: [PATCH 39/64] refscale: Add TINY scenario

This commit adds a TINY scenario in order to support tests of Tiny RCU
and Tiny SRCU.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 .../rcutorture/configs/refscale/TINY          | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 tools/testing/selftests/rcutorture/configs/refscale/TINY

diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY
new file mode 100644
index 000000000000..759343980b80
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY
@@ -0,0 +1,20 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
+#CHECK#CONFIG_PREEMPT_RCU=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_KPROBES=n
+CONFIG_FTRACE=n

From 4e39bb49c2de445848c2668d7f9fec6ba70724ae Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 21 Jul 2024 09:23:46 +0200
Subject: [PATCH 40/64] refscale: Optimize process_durations()

process_durations() is not a hot path, but there is no good reason to
iterate over and over the data already in 'buf'.

Using a seq_buf saves some useless strcat() and the need of a temp buffer.
Data is written directly at the correct place.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: "Paul E. McKenney" <paulmck@kernel.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/refscale.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index f4ea5b1ec068..cfec0648e141 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -28,6 +28,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
+#include <linux/seq_buf.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/stat.h>
@@ -891,32 +892,34 @@ static u64 process_durations(int n)
 {
 	int i;
 	struct reader_task *rt;
-	char buf1[64];
+	struct seq_buf s;
 	char *buf;
 	u64 sum = 0;
 
 	buf = kmalloc(800 + 64, GFP_KERNEL);
 	if (!buf)
 		return 0;
-	buf[0] = 0;
-	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
-		exp_idx);
+	seq_buf_init(&s, buf, 800 + 64);
+
+	seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+		       exp_idx);
 
 	for (i = 0; i < n && !torture_must_stop(); i++) {
 		rt = &(reader_tasks[i]);
-		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
 
 		if (i % 5 == 0)
-			strcat(buf, "\n");
-		if (strlen(buf) >= 800) {
-			pr_alert("%s", buf);
-			buf[0] = 0;
+			seq_buf_putc(&s, '\n');
+
+		if (seq_buf_used(&s) >= 800) {
+			pr_alert("%s", seq_buf_str(&s));
+			seq_buf_clear(&s);
 		}
-		strcat(buf, buf1);
+
+		seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns);
 
 		sum += rt->last_duration_ns;
 	}
-	pr_alert("%s\n", buf);
+	pr_alert("%s\n", seq_buf_str(&s));
 
 	kfree(buf);
 	return sum;

From ea793764b5c677cf16f114a5ee3fb620560a78f6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:42:58 -0700
Subject: [PATCH 41/64] rcuscale: Save a few lines with whitespace-only change

This whitespace-only commit fuses a few lines of code, taking advantage
of the newish 100-character-per-line limit to save a few lines of code.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index d534d4ec2314..3269dd9c639f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -1015,13 +1015,9 @@ rcu_scale_init(void)
 	}
 	while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
 		schedule_timeout_uninterruptible(1);
-	writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
-			       GFP_KERNEL);
-	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
-				   GFP_KERNEL);
-	writer_n_durations =
-		kcalloc(nrealwriters, sizeof(*writer_n_durations),
-			GFP_KERNEL);
+	writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL);
+	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
+	writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
 	if (!writer_tasks || !writer_durations || !writer_n_durations) {
 		SCALEOUT_ERRSTRING("out of memory");
 		firsterr = -ENOMEM;

From 42a8a2695ccbb98c15d73703cb39af70cdcf45fb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:42:59 -0700
Subject: [PATCH 42/64] rcuscale: Dump stacks of stalled rcu_scale_writer()
 instances

This commit improves debuggability by dumping the stacks of
rcu_scale_writer() instances that have not completed in a reasonable
timeframe.  These stacks are dumped remotely, but they will be accurate
in the thus-far common case where the stalled rcu_scale_writer() instances
are blocked.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 3269dd9c639f..5087ca7062d9 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -39,6 +39,7 @@
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/sched/debug.h>
 
 #include "rcu.h"
 
@@ -111,6 +112,7 @@ static struct task_struct **reader_tasks;
 static struct task_struct *shutdown_task;
 
 static u64 **writer_durations;
+static bool *writer_done;
 static int *writer_n_durations;
 static atomic_t n_rcu_scale_reader_started;
 static atomic_t n_rcu_scale_writer_started;
@@ -524,6 +526,7 @@ retry:
 			started = true;
 		if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
 			done = true;
+			WRITE_ONCE(writer_done[me], true);
 			sched_set_normal(current, 0);
 			pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
 				 scale_type, SCALE_FLAG, me, MIN_MEAS);
@@ -549,6 +552,19 @@ retry:
 		if (done && !alldone &&
 		    atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
 			alldone = true;
+		if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) {
+			static atomic_t dumped;
+			int i;
+
+			if (!atomic_xchg(&dumped, 1)) {
+				for (i = 0; i < nrealwriters; i++) {
+					if (writer_done[i])
+						continue;
+					pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
+					sched_show_task(writer_tasks[i]);
+				}
+			}
+		}
 		if (started && !alldone && i < MAX_MEAS - 1)
 			i++;
 		rcu_scale_wait_shutdown();
@@ -921,6 +937,8 @@ rcu_scale_cleanup(void)
 		kfree(writer_tasks);
 		kfree(writer_durations);
 		kfree(writer_n_durations);
+		kfree(writer_done);
+		writer_done = NULL;
 	}
 
 	/* Do torture-type-specific cleanup operations.  */
@@ -1015,10 +1033,11 @@ rcu_scale_init(void)
 	}
 	while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
 		schedule_timeout_uninterruptible(1);
-	writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL);
+	writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL);
 	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
 	writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
-	if (!writer_tasks || !writer_durations || !writer_n_durations) {
+	writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
+	if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) {
 		SCALEOUT_ERRSTRING("out of memory");
 		firsterr = -ENOMEM;
 		goto unwind;

From 48c21c020fcacba6deb065c78ac1876418b5be11 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:00 -0700
Subject: [PATCH 43/64] rcuscale: Dump grace-period statistics when
 rcu_scale_writer() stalls

This commit adds a .stats function pointer to the rcu_scale_ops structure,
and if this is non-NULL, it is invoked after stack traces are dumped in
response to a rcu_scale_writer() stall.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 5087ca7062d9..1b9a43653d7e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -145,6 +145,7 @@ struct rcu_scale_ops {
 	void (*sync)(void);
 	void (*exp_sync)(void);
 	struct task_struct *(*rso_gp_kthread)(void);
+	void (*stats)(void);
 	const char *name;
 };
 
@@ -226,6 +227,11 @@ static void srcu_scale_synchronize(void)
 	synchronize_srcu(srcu_ctlp);
 }
 
+static void srcu_scale_stats(void)
+{
+	srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG);
+}
+
 static void srcu_scale_synchronize_expedited(void)
 {
 	synchronize_srcu_expedited(srcu_ctlp);
@@ -243,6 +249,7 @@ static struct rcu_scale_ops srcu_ops = {
 	.gp_barrier	= srcu_rcu_barrier,
 	.sync		= srcu_scale_synchronize,
 	.exp_sync	= srcu_scale_synchronize_expedited,
+	.stats		= srcu_scale_stats,
 	.name		= "srcu"
 };
 
@@ -272,6 +279,7 @@ static struct rcu_scale_ops srcud_ops = {
 	.gp_barrier	= srcu_rcu_barrier,
 	.sync		= srcu_scale_synchronize,
 	.exp_sync	= srcu_scale_synchronize_expedited,
+	.stats		= srcu_scale_stats,
 	.name		= "srcud"
 };
 
@@ -563,6 +571,8 @@ retry:
 					pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
 					sched_show_task(writer_tasks[i]);
 				}
+				if (cur_ops->stats)
+					cur_ops->stats();
 			}
 		}
 		if (started && !alldone && i < MAX_MEAS - 1)

From 0616f7e981968743edb1ef758ba1a0e984240419 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:01 -0700
Subject: [PATCH 44/64] rcu: Mark callbacks not currently participating in
 barrier operation

RCU keeps a count of the number of callbacks that the current
rcu_barrier() is waiting on, but there is currently no easy way to
work out which callback is stuck.  One way to do this is to mark idle
RCU-barrier callbacks by making the ->next pointer point to the callback
itself, and this commit does just that.

Later commits will use this for debug output.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..f931171daecd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4403,6 +4403,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
 {
 	unsigned long __maybe_unused s = rcu_state.barrier_sequence;
 
+	rhp->next = rhp; // Mark the callback as having been invoked.
 	if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
 		rcu_barrier_trace(TPS("LastCB"), -1, s);
 		complete(&rcu_state.barrier_completion);
@@ -5424,6 +5425,8 @@ static void __init rcu_init_one(void)
 		while (i > rnp->grphi)
 			rnp++;
 		per_cpu_ptr(&rcu_data, i)->mynode = rnp;
+		per_cpu_ptr(&rcu_data, i)->barrier_head.next =
+			&per_cpu_ptr(&rcu_data, i)->barrier_head;
 		rcu_boot_init_percpu_data(i);
 	}
 }

From 674fc922f06f7632ac94536ca92dd8addb606f46 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:02 -0700
Subject: [PATCH 45/64] rcuscale: Print detailed grace-period and barrier
 diagnostics

This commit uses the new  rcu_tasks_torture_stats_print(),
rcu_tasks_trace_torture_stats_print(), and
rcu_tasks_rude_torture_stats_print() functions in order to provide
detailed diagnostics on grace-period, callback, and barrier state when
rcu_scale_writer() hangs.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 1b9a43653d7e..c507750e94d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -298,6 +298,11 @@ static void tasks_scale_read_unlock(int idx)
 {
 }
 
+static void rcu_tasks_scale_stats(void)
+{
+	rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
 static struct rcu_scale_ops tasks_ops = {
 	.ptype		= RCU_TASKS_FLAVOR,
 	.init		= rcu_sync_scale_init,
@@ -310,6 +315,7 @@ static struct rcu_scale_ops tasks_ops = {
 	.sync		= synchronize_rcu_tasks,
 	.exp_sync	= synchronize_rcu_tasks,
 	.rso_gp_kthread	= get_rcu_tasks_gp_kthread,
+	.stats		= IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats,
 	.name		= "tasks"
 };
 
@@ -336,6 +342,11 @@ static void tasks_rude_scale_read_unlock(int idx)
 {
 }
 
+static void rcu_tasks_rude_scale_stats(void)
+{
+	rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
 static struct rcu_scale_ops tasks_rude_ops = {
 	.ptype		= RCU_TASKS_RUDE_FLAVOR,
 	.init		= rcu_sync_scale_init,
@@ -346,6 +357,7 @@ static struct rcu_scale_ops tasks_rude_ops = {
 	.sync		= synchronize_rcu_tasks_rude,
 	.exp_sync	= synchronize_rcu_tasks_rude,
 	.rso_gp_kthread	= get_rcu_tasks_rude_gp_kthread,
+	.stats		= IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats,
 	.name		= "tasks-rude"
 };
 
@@ -374,6 +386,11 @@ static void tasks_trace_scale_read_unlock(int idx)
 	rcu_read_unlock_trace();
 }
 
+static void rcu_tasks_trace_scale_stats(void)
+{
+	rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
 static struct rcu_scale_ops tasks_tracing_ops = {
 	.ptype		= RCU_TASKS_FLAVOR,
 	.init		= rcu_sync_scale_init,
@@ -386,6 +403,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
 	.sync		= synchronize_rcu_tasks_trace,
 	.exp_sync	= synchronize_rcu_tasks_trace,
 	.rso_gp_kthread	= get_rcu_tasks_trace_gp_kthread,
+	.stats		= IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
 	.name		= "tasks-tracing"
 };
 

From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 13:33:58 -0700
Subject: [PATCH 46/64] locking/csd_lock: Provide an indication of ongoing
 CSD-lock stall

If a CSD-lock stall goes on long enough, it will cause an RCU CPU
stall warning.  This additional warning provides much additional
console-log traffic and little additional information.  Therefore,
provide a new csd_lock_is_stuck() function that returns true if there
is an ongoing CSD-lock stall.  This function will be used by the RCU
CPU stall warnings to provide a one-line indication of the stall when
this function returns true.

[ neeraj.upadhyay: Apply Rik van Riel feedback. ]
[ neeraj.upadhyay: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/smp.h |  6 ++++++
 kernel/smp.c        | 16 ++++++++++++++++
 lib/Kconfig.debug   |  1 +
 3 files changed, 23 insertions(+)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index fcd61dfe2af3..3871bd32018f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu);
 int smpcfd_dead_cpu(unsigned int cpu);
 int smpcfd_dying_cpu(unsigned int cpu);
 
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+bool csd_lock_is_stuck(void);
+#else
+static inline bool csd_lock_is_stuck(void) { return false; }
+#endif
+
 #endif /* __LINUX_SMP_H */
diff --git a/kernel/smp.c b/kernel/smp.c
index e87953729230..202cda4d2a55 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -208,6 +208,19 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd)
 	return -1;
 }
 
+static atomic_t n_csd_lock_stuck;
+
+/**
+ * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long?
+ *
+ * Returns @true if a CSD-lock acquisition is stuck and has been stuck
+ * long enough for a "non-responsive CSD lock" message to be printed.
+ */
+bool csd_lock_is_stuck(void)
+{
+	return !!atomic_read(&n_csd_lock_stuck);
+}
+
 /*
  * Complain if too much time spent waiting.  Note that only
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
@@ -229,6 +242,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 		cpu = csd_lock_wait_getcpu(csd);
 		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
 			 *bug_id, raw_smp_processor_id(), cpu);
+		atomic_dec(&n_csd_lock_stuck);
 		return true;
 	}
 
@@ -252,6 +266,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
 		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
 		 cpu, csd->func, csd->info);
+	if (firsttime)
+		atomic_inc(&n_csd_lock_stuck);
 	/*
 	 * If the CSD lock is still stuck after 5 minutes, it is unlikely
 	 * to become unstuck. Use a signed comparison to avoid triggering
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a30c03a66172..4e5f61cba8e4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1614,6 +1614,7 @@ config SCF_TORTURE_TEST
 config CSD_LOCK_WAIT_DEBUG
 	bool "Debugging for csd_lock_wait(), called from smp_call_function*()"
 	depends on DEBUG_KERNEL
+	depends on SMP
 	depends on 64BIT
 	default n
 	help

From d40760d6811d55172ba6b90ebd2a60a75f88bffe Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jul 2024 14:32:20 -0700
Subject: [PATCH 47/64] locking/csd-lock: Use backoff for repeated reports of
 same incident

Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y
kernels are emitted at five-second intervals.  Although this has proven
to be a good time interval for the first diagnostic, if the target CPU
keeps interrupts disabled for way longer than five seconds, the ratio
of useful new information to pointless repetition increases considerably.

Therefore, back off the time period for repeated reports of the same
incident, increasing linearly with the number of reports and logarithmicly
with the number of online CPUs.

[ paulmck: Apply Dan Carpenter feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/smp.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 202cda4d2a55..b484ee6dcaf6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -226,7 +226,7 @@ bool csd_lock_is_stuck(void)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages)
 {
 	int cpu = -1;
 	int cpux;
@@ -249,7 +249,9 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	ts2 = sched_clock();
 	/* How long since we last checked for a stuck CSD lock.*/
 	ts_delta = ts2 - *ts1;
-	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
+	if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
+			       (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) ||
+		   csd_lock_timeout_ns == 0))
 		return false;
 
 	firsttime = !*bug_id;
@@ -266,6 +268,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
 		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
 		 cpu, csd->func, csd->info);
+	(*nmessages)++;
 	if (firsttime)
 		atomic_inc(&n_csd_lock_stuck);
 	/*
@@ -306,12 +309,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
  */
 static void __csd_lock_wait(call_single_data_t *csd)
 {
+	unsigned long nmessages = 0;
 	int bug_id = 0;
 	u64 ts0, ts1;
 
 	ts1 = ts0 = sched_clock();
 	for (;;) {
-		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
 			break;
 		cpu_relax();
 	}

From 9fbaa44114ca6b69074a488295e86a9fa7e685f9 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 15 Jul 2024 13:49:41 -0400
Subject: [PATCH 48/64] smp: print only local CPU info when sched_clock goes
 backward

About 40% of all csd_lock warnings observed in our fleet appear to
be due to sched_clock() going backward in time (usually only a little
bit), resulting in ts0 being larger than ts2.

When the local CPU is at fault, we should print out a message reflecting
that, rather than trying to get the remote CPU's stack trace.

Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/smp.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/smp.c b/kernel/smp.c
index b484ee6dcaf6..f25e20617b7e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -254,6 +254,14 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 		   csd_lock_timeout_ns == 0))
 		return false;
 
+	if (ts0 > ts2) {
+		/* Our own sched_clock went backward; don't blame another CPU. */
+		ts_delta = ts0 - ts2;
+		pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta);
+		*ts1 = ts2;
+		return false;
+	}
+
 	firsttime = !*bug_id;
 	if (firsttime)
 		*bug_id = atomic_inc_return(&csd_bug_count);

From 1dd01c06506c42ab3a8026272ee93c466ae0cb4b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 19:30:56 -0700
Subject: [PATCH 49/64] rcu: Summarize RCU CPU stall warnings during CSD-lock
 stalls

During CSD-lock stalls, the additional information output by RCU CPU
stall warnings is usually redundant, flooding the console for not good
reason.  However, this has been the way things work for a few years.
This commit therefore adds an rcutree.csd_lock_suppress_rcu_stall kernel
boot parameter that causes RCU CPU stall warnings to be abbreviated to
a single line when there is at least one CPU that has been stuck waiting
for CSD lock for more than five seconds.

To make this abbreviated message happen with decent probability:

tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 8 \
	--configs "2*TREE01" --kconfig "CONFIG_CSD_LOCK_WAIT_DEBUG=y" \
	--bootargs "csdlock_debug=1 rcutorture.stall_cpu=200 \
	rcutorture.stall_cpu_holdoff=120 rcutorture.stall_cpu_irqsoff=1 \
	rcutree.csd_lock_suppress_rcu_stall=1 \
	rcupdate.rcu_exp_cpu_stall_timeout=5000" --trust-make

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 4 ++++
 kernel/rcu/tree_stall.h                         | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1384c7b59c9..d56356c13184 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4937,6 +4937,10 @@
 			Set maximum number of finished RCU callbacks to
 			process in one batch.
 
+	rcutree.csd_lock_suppress_rcu_stall=	[KNL]
+			Do only a one-line RCU CPU stall warning when
+			there is an ongoing too-long CSD-lock wait.
+
 	rcutree.do_rcu_barrier=	[KNL]
 			Request a call to rcu_barrier().  This is
 			throttled so that userspace tests can safely
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4b0e9d7c4c68..b497d4c6dabd 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -9,6 +9,7 @@
 
 #include <linux/kvm_para.h>
 #include <linux/rcu_notifier.h>
+#include <linux/smp.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -719,6 +720,9 @@ static void print_cpu_stall(unsigned long gps)
 	set_preempt_need_resched();
 }
 
+static bool csd_lock_suppress_rcu_stall;
+module_param(csd_lock_suppress_rcu_stall, bool, 0644);
+
 static void check_cpu_stall(struct rcu_data *rdp)
 {
 	bool self_detected;
@@ -791,7 +795,9 @@ static void check_cpu_stall(struct rcu_data *rdp)
 			return;
 
 		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
-		if (self_detected) {
+		if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+			pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+		} else if (self_detected) {
 			/* We haven't checked in, so go dump stack. */
 			print_cpu_stall(gps);
 		} else {

From 27d5749d075578b105e958369a0263a0e15a8d3f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 21:19:06 -0700
Subject: [PATCH 50/64] rcu: Extract synchronize_rcu_expedited_stall() from
 synchronize_rcu_expedited_wait()

This commit extracts the RCU CPU stall-warning report code from
synchronize_rcu_expedited_wait() and places it in a new function named
synchronize_rcu_expedited_stall().  This is strictly a code-movement
commit.  A later commit will use this reorganization to avoid printing
expedited RCU CPU stall warnings while there are ongoing CSD-lock stall
reports.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_exp.h | 111 +++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 51 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4acd29d16fdb..17dd5169012d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -542,6 +542,65 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
 	return false;
 }
 
+/*
+ * Print out an expedited RCU CPU stall warning message.
+ */
+static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j)
+{
+	int cpu;
+	unsigned long mask;
+	int ndetected;
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root();
+
+	pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name);
+	ndetected = 0;
+	rcu_for_each_leaf_node(rnp) {
+		ndetected += rcu_print_task_exp_stall(rnp);
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
+			struct rcu_data *rdp;
+
+			mask = leaf_node_cpu_bit(rnp, cpu);
+			if (!(READ_ONCE(rnp->expmask) & mask))
+				continue;
+			ndetected++;
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			pr_cont(" %d-%c%c%c%c", cpu,
+				"O."[!!cpu_online(cpu)],
+				"o."[!!(rdp->grpmask & rnp->expmaskinit)],
+				"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+				"D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
+		}
+	}
+	pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+		j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
+		".T"[!!data_race(rnp_root->exp_tasks)]);
+	if (ndetected) {
+		pr_err("blocking rcu_node structures (internal RCU debug):");
+		rcu_for_each_node_breadth_first(rnp) {
+			if (rnp == rnp_root)
+				continue; /* printed unconditionally */
+			if (sync_rcu_exp_done_unlocked(rnp))
+				continue;
+			pr_cont(" l=%u:%d-%d:%#lx/%c",
+				rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask),
+				".T"[!!data_race(rnp->exp_tasks)]);
+		}
+		pr_cont("\n");
+	}
+	rcu_for_each_leaf_node(rnp) {
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
+			mask = leaf_node_cpu_bit(rnp, cpu);
+			if (!(READ_ONCE(rnp->expmask) & mask))
+				continue;
+			preempt_disable(); // For smp_processor_id() in dump_cpu_task().
+			dump_cpu_task(cpu);
+			preempt_enable();
+		}
+		rcu_exp_print_detail_task_stall_rnp(rnp);
+	}
+}
+
 /*
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
@@ -553,10 +612,8 @@ static void synchronize_rcu_expedited_wait(void)
 	unsigned long jiffies_stall;
 	unsigned long jiffies_start;
 	unsigned long mask;
-	int ndetected;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
-	struct rcu_node *rnp_root = rcu_get_root();
 	unsigned long flags;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
@@ -593,55 +650,7 @@ static void synchronize_rcu_expedited_wait(void)
 		j = jiffies;
 		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
 		trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
-		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
-		       rcu_state.name);
-		ndetected = 0;
-		rcu_for_each_leaf_node(rnp) {
-			ndetected += rcu_print_task_exp_stall(rnp);
-			for_each_leaf_node_possible_cpu(rnp, cpu) {
-				struct rcu_data *rdp;
-
-				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(READ_ONCE(rnp->expmask) & mask))
-					continue;
-				ndetected++;
-				rdp = per_cpu_ptr(&rcu_data, cpu);
-				pr_cont(" %d-%c%c%c%c", cpu,
-					"O."[!!cpu_online(cpu)],
-					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
-					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
-					"D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
-			}
-		}
-		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
-			j - jiffies_start, rcu_state.expedited_sequence,
-			data_race(rnp_root->expmask),
-			".T"[!!data_race(rnp_root->exp_tasks)]);
-		if (ndetected) {
-			pr_err("blocking rcu_node structures (internal RCU debug):");
-			rcu_for_each_node_breadth_first(rnp) {
-				if (rnp == rnp_root)
-					continue; /* printed unconditionally */
-				if (sync_rcu_exp_done_unlocked(rnp))
-					continue;
-				pr_cont(" l=%u:%d-%d:%#lx/%c",
-					rnp->level, rnp->grplo, rnp->grphi,
-					data_race(rnp->expmask),
-					".T"[!!data_race(rnp->exp_tasks)]);
-			}
-			pr_cont("\n");
-		}
-		rcu_for_each_leaf_node(rnp) {
-			for_each_leaf_node_possible_cpu(rnp, cpu) {
-				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(READ_ONCE(rnp->expmask) & mask))
-					continue;
-				preempt_disable(); // For smp_processor_id() in dump_cpu_task().
-				dump_cpu_task(cpu);
-				preempt_enable();
-			}
-			rcu_exp_print_detail_task_stall_rnp(rnp);
-		}
+		synchronize_rcu_expedited_stall(jiffies_start, j);
 		jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
 		panic_on_rcu_stall();
 	}

From 7c72dedb0079e62c1c75dbab038332017f34a6b8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 21:24:29 -0700
Subject: [PATCH 51/64] rcu: Summarize expedited RCU CPU stall warnings during
 CSD-lock stalls

During CSD-lock stalls, the additional information output by expedited
RCU CPU stall warnings is usually redundant, flooding the console for
not good reason.  However, this has been the way things work for a few
years.  This commit therefore uses rcutree.csd_lock_suppress_rcu_stall
kernel boot parameter that causes expedited RCU CPU stall warnings to
be abbreviated to a single line when there is at least one CPU that has
been stuck waiting for CSD lock for more than five seconds.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_exp.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 17dd5169012d..d4be644afb50 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -553,6 +553,10 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_root = rcu_get_root();
 
+	if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+		pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+		return;
+	}
 	pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name);
 	ndetected = 0;
 	rcu_for_each_leaf_node(rnp) {

From 51b739990cc81316768b85db519eac3999ba92a9 Mon Sep 17 00:00:00 2001
From: Ryo Takakura <takakura@valinux.co.jp>
Date: Fri, 28 Jun 2024 13:18:26 +0900
Subject: [PATCH 52/64] rcu: Let dump_cpu_task() be used without preemption
 disabled

The commit 2d7f00b2f0130 ("rcu: Suppress smp_processor_id() complaint
in synchronize_rcu_expedited_wait()") disabled preemption around
dump_cpu_task() to suppress warning on its usage within preemtible context.

Calling dump_cpu_task() doesn't required to be in non-preemptible context
except for suppressing the smp_processor_id() warning.
As the smp_processor_id() is evaluated along with in_hardirq()
to check if it's in interrupt context, this patch removes the need
for its preemtion disablement by reordering the condition so that
smp_processor_id() only gets evaluated when it's in interrupt context.

Signed-off-by: Ryo Takakura <takakura@valinux.co.jp>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 --
 kernel/sched/core.c   | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d4be644afb50..c5d9a7eb0803 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -597,9 +597,7 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
 			mask = leaf_node_cpu_bit(rnp, cpu);
 			if (!(READ_ONCE(rnp->expmask) & mask))
 				continue;
-			preempt_disable(); // For smp_processor_id() in dump_cpu_task().
 			dump_cpu_task(cpu);
-			preempt_enable();
 		}
 		rcu_exp_print_detail_task_stall_rnp(rnp);
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a9f655025607..78ae888a1fa2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9726,7 +9726,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 
 void dump_cpu_task(int cpu)
 {
-	if (cpu == smp_processor_id() && in_hardirq()) {
+	if (in_hardirq() && cpu == smp_processor_id()) {
 		struct pt_regs *regs;
 
 		regs = get_irq_regs();

From 11377947b5861fa59bf77c827e1dd7c081842cc9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:03 -0700
Subject: [PATCH 53/64] rcuscale: Provide clear error when async specified
 without primitives

Currently, if the rcuscale module's async module parameter is specified
for RCU implementations that do not have async primitives such as RCU
Tasks Rude (which now lacks a call_rcu_tasks_rude() function), there
will be a series of splats due to calls to a NULL pointer.  This commit
therefore warns of this situation, but switches to non-async testing.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index c507750e94d8..79e1c32d5c0f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -525,7 +525,7 @@ rcu_scale_writer(void *arg)
 			schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
 		wdp = &wdpp[i];
 		*wdp = ktime_get_mono_fast_ns();
-		if (gp_async) {
+		if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
 retry:
 			if (!rhp)
 				rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
@@ -597,7 +597,7 @@ retry:
 			i++;
 		rcu_scale_wait_shutdown();
 	} while (!torture_must_stop());
-	if (gp_async) {
+	if (gp_async && cur_ops->async) {
 		cur_ops->gp_barrier();
 	}
 	writer_n_durations[me] = i_max + 1;

From abaf1322adbfd4faa28072fd5412e54b9722e477 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:04 -0700
Subject: [PATCH 54/64] rcuscale: Make all writer tasks report upon hang

This commit causes all writer tasks to provide a brief report after a
hang has been reported, spaced at one-second intervals.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 79e1c32d5c0f..dfe8e0faa4d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -483,6 +483,7 @@ rcu_scale_writer(void *arg)
 	unsigned long jdone;
 	long me = (long)arg;
 	struct rcu_head *rhp = NULL;
+	bool selfreport = false;
 	bool started = false, done = false, alldone = false;
 	u64 t;
 	DEFINE_TORTURE_RANDOM(tr);
@@ -593,6 +594,11 @@ retry:
 					cur_ops->stats();
 			}
 		}
+		if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) {
+			pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n",
+				__func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
+			selfreport = true;
+		}
 		if (started && !alldone && i < MAX_MEAS - 1)
 			i++;
 		rcu_scale_wait_shutdown();

From 3e3c4f0e2753e92aa2a248d34c6866c0a264ac96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:05 -0700
Subject: [PATCH 55/64] rcuscale: Make rcu_scale_writer() tolerate repeated
 GFP_KERNEL failure

Under some conditions, kmalloc(GFP_KERNEL) allocations have been
observed to repeatedly fail.  This situation has been observed to
cause one of the rcu_scale_writer() instances to loop indefinitely
retrying memory allocation for an asynchronous grace-period primitive.
The problem is that if memory is short, all the other instances will
allocate all available memory before the looping task is awakened from
its rcu_barrier*() call.  This in turn results in hangs, so that rcuscale
fails to complete.

This commit therefore removes the tight retry loop, so that when this
condition occurs, the affected task is still passing through the full
loop with its full set of termination checks.  This spreads the risk
of indefinite memory-allocation retry failures across all instances of
rcu_scale_writer() tasks, which in turn prevents the hangs.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index dfe8e0faa4d8..80518662273b 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -520,6 +520,8 @@ rcu_scale_writer(void *arg)
 
 	jdone = jiffies + minruntime * HZ;
 	do {
+		bool gp_succeeded = false;
+
 		if (writer_holdoff)
 			udelay(writer_holdoff);
 		if (writer_holdoff_jiffies)
@@ -527,23 +529,24 @@ rcu_scale_writer(void *arg)
 		wdp = &wdpp[i];
 		*wdp = ktime_get_mono_fast_ns();
 		if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
-retry:
 			if (!rhp)
 				rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
 			if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
 				atomic_inc(this_cpu_ptr(&n_async_inflight));
 				cur_ops->async(rhp, rcu_scale_async_cb);
 				rhp = NULL;
+				gp_succeeded = true;
 			} else if (!kthread_should_stop()) {
 				cur_ops->gp_barrier();
-				goto retry;
 			} else {
 				kfree(rhp); /* Because we are stopping. */
 			}
 		} else if (gp_exp) {
 			cur_ops->exp_sync();
+			gp_succeeded = true;
 		} else {
 			cur_ops->sync();
+			gp_succeeded = true;
 		}
 		t = ktime_get_mono_fast_ns();
 		*wdp = t - *wdp;
@@ -599,7 +602,7 @@ retry:
 				__func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
 			selfreport = true;
 		}
-		if (started && !alldone && i < MAX_MEAS - 1)
+		if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1)
 			i++;
 		rcu_scale_wait_shutdown();
 	} while (!torture_must_stop());

From 1c3e6e7903fa939601fdd326b04507304ba22337 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:06 -0700
Subject: [PATCH 56/64] rcuscale: Use special allocator for rcu_scale_writer()

The rcu_scale_writer() function needs only a fixed number of rcu_head
structures per kthread, which means that a trivial allocator suffices.
This commit therefore uses an llist-based allocator using a fixed array of
structures per kthread.  This allows aggressive testing of RCU performance
without stressing the slab allocators.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 123 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 113 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 80518662273b..bc7cca979c06 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -105,6 +105,19 @@ static char *scale_type = "rcu";
 module_param(scale_type, charp, 0444);
 MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)");
 
+// Structure definitions for custom fixed-per-task allocator.
+struct writer_mblock {
+	struct rcu_head wmb_rh;
+	struct llist_node wmb_node;
+	struct writer_freelist *wmb_wfl;
+};
+
+struct writer_freelist {
+	struct llist_head ws_lhg;
+	struct llist_head ____cacheline_internodealigned_in_smp ws_lhp;
+	struct writer_mblock *ws_mblocks;
+};
+
 static int nrealreaders;
 static int nrealwriters;
 static struct task_struct **writer_tasks;
@@ -113,6 +126,7 @@ static struct task_struct *shutdown_task;
 
 static u64 **writer_durations;
 static bool *writer_done;
+static struct writer_freelist *writer_freelists;
 static int *writer_n_durations;
 static atomic_t n_rcu_scale_reader_started;
 static atomic_t n_rcu_scale_writer_started;
@@ -463,13 +477,52 @@ rcu_scale_reader(void *arg)
 	return 0;
 }
 
+/*
+ * Allocate a writer_mblock structure for the specified rcu_scale_writer
+ * task.
+ */
+static struct writer_mblock *rcu_scale_alloc(long me)
+{
+	struct llist_node *llnp;
+	struct writer_freelist *wflp;
+	struct writer_mblock *wmbp;
+
+	if (WARN_ON_ONCE(!writer_freelists))
+		return NULL;
+	wflp = &writer_freelists[me];
+	if (llist_empty(&wflp->ws_lhp)) {
+		// ->ws_lhp is private to its rcu_scale_writer task.
+		wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node);
+		wflp->ws_lhp.first = &wmbp->wmb_node;
+	}
+	llnp = llist_del_first(&wflp->ws_lhp);
+	if (!llnp)
+		return NULL;
+	return container_of(llnp, struct writer_mblock, wmb_node);
+}
+
+/*
+ * Free a writer_mblock structure to its rcu_scale_writer task.
+ */
+static void rcu_scale_free(struct writer_mblock *wmbp)
+{
+	struct writer_freelist *wflp;
+
+	if (!wmbp)
+		return;
+	wflp = wmbp->wmb_wfl;
+	llist_add(&wmbp->wmb_node, &wflp->ws_lhg);
+}
+
 /*
  * Callback function for asynchronous grace periods from rcu_scale_writer().
  */
 static void rcu_scale_async_cb(struct rcu_head *rhp)
 {
+	struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh);
+
 	atomic_dec(this_cpu_ptr(&n_async_inflight));
-	kfree(rhp);
+	rcu_scale_free(wmbp);
 }
 
 /*
@@ -482,13 +535,13 @@ rcu_scale_writer(void *arg)
 	int i_max;
 	unsigned long jdone;
 	long me = (long)arg;
-	struct rcu_head *rhp = NULL;
 	bool selfreport = false;
 	bool started = false, done = false, alldone = false;
 	u64 t;
 	DEFINE_TORTURE_RANDOM(tr);
 	u64 *wdp;
 	u64 *wdpp = writer_durations[me];
+	struct writer_mblock *wmbp = NULL;
 
 	VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
 	WARN_ON(!wdpp);
@@ -529,17 +582,18 @@ rcu_scale_writer(void *arg)
 		wdp = &wdpp[i];
 		*wdp = ktime_get_mono_fast_ns();
 		if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
-			if (!rhp)
-				rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
-			if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+			if (!wmbp)
+				wmbp = rcu_scale_alloc(me);
+			if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
 				atomic_inc(this_cpu_ptr(&n_async_inflight));
-				cur_ops->async(rhp, rcu_scale_async_cb);
-				rhp = NULL;
+				cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
+				wmbp = NULL;
 				gp_succeeded = true;
 			} else if (!kthread_should_stop()) {
 				cur_ops->gp_barrier();
 			} else {
-				kfree(rhp); /* Because we are stopping. */
+				rcu_scale_free(wmbp); /* Because we are stopping. */
+				wmbp = NULL;
 			}
 		} else if (gp_exp) {
 			cur_ops->exp_sync();
@@ -607,6 +661,7 @@ rcu_scale_writer(void *arg)
 		rcu_scale_wait_shutdown();
 	} while (!torture_must_stop());
 	if (gp_async && cur_ops->async) {
+		rcu_scale_free(wmbp);
 		cur_ops->gp_barrier();
 	}
 	writer_n_durations[me] = i_max + 1;
@@ -970,12 +1025,30 @@ rcu_scale_cleanup(void)
 					schedule_timeout_uninterruptible(1);
 			}
 			kfree(writer_durations[i]);
+			if (writer_freelists) {
+				int ctr = 0;
+				struct llist_node *llnp;
+				struct writer_freelist *wflp = &writer_freelists[i];
+
+				if (wflp->ws_mblocks) {
+					llist_for_each(llnp, wflp->ws_lhg.first)
+						ctr++;
+					llist_for_each(llnp, wflp->ws_lhp.first)
+						ctr++;
+					WARN_ONCE(ctr != gp_async_max,
+						  "%s: ctr = %d gp_async_max = %d\n",
+						  __func__, ctr, gp_async_max);
+					kfree(wflp->ws_mblocks);
+				}
+			}
 		}
 		kfree(writer_tasks);
 		kfree(writer_durations);
 		kfree(writer_n_durations);
 		kfree(writer_done);
 		writer_done = NULL;
+		kfree(writer_freelists);
+		writer_freelists = NULL;
 	}
 
 	/* Do torture-type-specific cleanup operations.  */
@@ -1002,8 +1075,9 @@ rcu_scale_shutdown(void *arg)
 static int __init
 rcu_scale_init(void)
 {
-	long i;
 	int firsterr = 0;
+	long i;
+	long j;
 	static struct rcu_scale_ops *scale_ops[] = {
 		&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
 	};
@@ -1074,7 +1148,18 @@ rcu_scale_init(void)
 	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
 	writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
 	writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
-	if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) {
+	if (gp_async) {
+		if (gp_async_max <= 0) {
+			pr_warn("%s: gp_async_max = %d must be greater than zero.\n",
+				__func__, gp_async_max);
+			WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+			firsterr = -EINVAL;
+			goto unwind;
+		}
+		writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL);
+	}
+	if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done ||
+	    (gp_async && !writer_freelists)) {
 		SCALEOUT_ERRSTRING("out of memory");
 		firsterr = -ENOMEM;
 		goto unwind;
@@ -1087,6 +1172,24 @@ rcu_scale_init(void)
 			firsterr = -ENOMEM;
 			goto unwind;
 		}
+		if (writer_freelists) {
+			struct writer_freelist *wflp = &writer_freelists[i];
+
+			init_llist_head(&wflp->ws_lhg);
+			init_llist_head(&wflp->ws_lhp);
+			wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]),
+						   GFP_KERNEL);
+			if (!wflp->ws_mblocks) {
+				firsterr = -ENOMEM;
+				goto unwind;
+			}
+			for (j = 0; j < gp_async_max; j++) {
+				struct writer_mblock *wmbp = &wflp->ws_mblocks[j];
+
+				wmbp->wmb_wfl = wflp;
+				llist_add(&wmbp->wmb_node, &wflp->ws_lhp);
+			}
+		}
 		firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
 						  writer_tasks[i]);
 		if (torture_init_error(firsterr))

From 554f07a119866a78606b36123fb28e9451b1f994 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:07 -0700
Subject: [PATCH 57/64] rcuscale: NULL out top-level pointers to heap memory

Currently, if someone modprobes and rmmods rcuscale successfully, but
the next run errors out during the modprobe, non-NULL pointers to freed
memory will remain.  If the run after that also errors out during the
modprobe, there will be double-free bugs.

This commit therefore NULLs out top-level pointers to memory that has
just been freed.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index bc7cca979c06..61a178914256 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -819,6 +819,7 @@ kfree_scale_cleanup(void)
 			torture_stop_kthread(kfree_scale_thread,
 					     kfree_reader_tasks[i]);
 		kfree(kfree_reader_tasks);
+		kfree_reader_tasks = NULL;
 	}
 
 	torture_cleanup_end();
@@ -987,6 +988,7 @@ rcu_scale_cleanup(void)
 			torture_stop_kthread(rcu_scale_reader,
 					     reader_tasks[i]);
 		kfree(reader_tasks);
+		reader_tasks = NULL;
 	}
 
 	if (writer_tasks) {
@@ -1043,8 +1045,11 @@ rcu_scale_cleanup(void)
 			}
 		}
 		kfree(writer_tasks);
+		writer_tasks = NULL;
 		kfree(writer_durations);
+		writer_durations = NULL;
 		kfree(writer_n_durations);
+		writer_n_durations = NULL;
 		kfree(writer_done);
 		writer_done = NULL;
 		kfree(writer_freelists);

From f1fd0e0bb12d09ca0564425b8af76fddf2e380fb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:43:08 -0700
Subject: [PATCH 58/64] rcuscale: Count outstanding callbacks per-task rather
 than per-CPU

The current rcu_scale_writer() asynchronous grace-period testing uses a
per-CPU counter to track the number of outstanding callbacks.  This is
subject to CPU-imbalance errors when tasks migrate from one CPU to another
between the time that the counter is incremented and the callback is
queued, and additionally in kernels configured such that callbacks can
be invoked on some CPU other than the one that queued it.

This commit therefore arranges for per-task callback counts, thus avoiding
any issues with migration of either tasks or callbacks.

Reported-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/rcuscale.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 61a178914256..6d37596deb1f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -114,6 +114,7 @@ struct writer_mblock {
 
 struct writer_freelist {
 	struct llist_head ws_lhg;
+	atomic_t ws_inflight;
 	struct llist_head ____cacheline_internodealigned_in_smp ws_lhp;
 	struct writer_mblock *ws_mblocks;
 };
@@ -136,7 +137,6 @@ static u64 t_rcu_scale_writer_started;
 static u64 t_rcu_scale_writer_finished;
 static unsigned long b_rcu_gp_test_started;
 static unsigned long b_rcu_gp_test_finished;
-static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 #define MAX_MEAS 10000
 #define MIN_MEAS 100
@@ -520,8 +520,9 @@ static void rcu_scale_free(struct writer_mblock *wmbp)
 static void rcu_scale_async_cb(struct rcu_head *rhp)
 {
 	struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh);
+	struct writer_freelist *wflp = wmbp->wmb_wfl;
 
-	atomic_dec(this_cpu_ptr(&n_async_inflight));
+	atomic_dec(&wflp->ws_inflight);
 	rcu_scale_free(wmbp);
 }
 
@@ -541,6 +542,7 @@ rcu_scale_writer(void *arg)
 	DEFINE_TORTURE_RANDOM(tr);
 	u64 *wdp;
 	u64 *wdpp = writer_durations[me];
+	struct writer_freelist *wflp = &writer_freelists[me];
 	struct writer_mblock *wmbp = NULL;
 
 	VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
@@ -584,8 +586,8 @@ rcu_scale_writer(void *arg)
 		if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
 			if (!wmbp)
 				wmbp = rcu_scale_alloc(me);
-			if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
-				atomic_inc(this_cpu_ptr(&n_async_inflight));
+			if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) {
+				atomic_inc(&wflp->ws_inflight);
 				cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
 				wmbp = NULL;
 				gp_succeeded = true;

From 8f35fefad06323ab8ad5915bb7c6bc1a684cb8b5 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Tue, 6 Aug 2024 15:30:16 +0200
Subject: [PATCH 59/64] refscale: Constify struct ref_scale_ops

'struct ref_scale_ops' are not modified in these drivers.

Constifying this structure moves some data to a read-only section, so
increase overall security.

On a x86_64, with allmodconfig:
Before:
======
   text	   data	    bss	    dec	    hex	filename
  34231	   4167	    736	  39134	   98de	kernel/rcu/refscale.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  35175	   3239	    736	  39150	   98ee	kernel/rcu/refscale.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/refscale.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index cfec0648e141..0db9db73f57f 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -135,7 +135,7 @@ struct ref_scale_ops {
 	const char *name;
 };
 
-static struct ref_scale_ops *cur_ops;
+static const struct ref_scale_ops *cur_ops;
 
 static void un_delay(const int udl, const int ndl)
 {
@@ -171,7 +171,7 @@ static bool rcu_sync_scale_init(void)
 	return true;
 }
 
-static struct ref_scale_ops rcu_ops = {
+static const struct ref_scale_ops rcu_ops = {
 	.init		= rcu_sync_scale_init,
 	.readsection	= ref_rcu_read_section,
 	.delaysection	= ref_rcu_delay_section,
@@ -205,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const
 	}
 }
 
-static struct ref_scale_ops srcu_ops = {
+static const struct ref_scale_ops srcu_ops = {
 	.init		= rcu_sync_scale_init,
 	.readsection	= srcu_ref_scale_read_section,
 	.delaysection	= srcu_ref_scale_delay_section,
@@ -232,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c
 		un_delay(udl, ndl);
 }
 
-static struct ref_scale_ops rcu_tasks_ops = {
+static const struct ref_scale_ops rcu_tasks_ops = {
 	.init		= rcu_sync_scale_init,
 	.readsection	= rcu_tasks_ref_scale_read_section,
 	.delaysection	= rcu_tasks_ref_scale_delay_section,
@@ -271,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c
 	}
 }
 
-static struct ref_scale_ops rcu_trace_ops = {
+static const struct ref_scale_ops rcu_trace_ops = {
 	.init		= rcu_sync_scale_init,
 	.readsection	= rcu_trace_ref_scale_read_section,
 	.delaysection	= rcu_trace_ref_scale_delay_section,
@@ -310,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int
 	}
 }
 
-static struct ref_scale_ops refcnt_ops = {
+static const struct ref_scale_ops refcnt_ops = {
 	.init		= rcu_sync_scale_init,
 	.readsection	= ref_refcnt_section,
 	.delaysection	= ref_refcnt_delay_section,
@@ -347,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int
 	}
 }
 
-static struct ref_scale_ops rwlock_ops = {
+static const struct ref_scale_ops rwlock_ops = {
 	.init		= ref_rwlock_init,
 	.readsection	= ref_rwlock_section,
 	.delaysection	= ref_rwlock_delay_section,
@@ -384,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n
 	}
 }
 
-static struct ref_scale_ops rwsem_ops = {
+static const struct ref_scale_ops rwsem_ops = {
 	.init		= ref_rwsem_init,
 	.readsection	= ref_rwsem_section,
 	.delaysection	= ref_rwsem_delay_section,
@@ -419,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
 	preempt_enable();
 }
 
-static struct ref_scale_ops lock_ops = {
+static const struct ref_scale_ops lock_ops = {
 	.readsection	= ref_lock_section,
 	.delaysection	= ref_lock_delay_section,
 	.name		= "lock"
@@ -454,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
 	preempt_enable();
 }
 
-static struct ref_scale_ops lock_irq_ops = {
+static const struct ref_scale_ops lock_irq_ops = {
 	.readsection	= ref_lock_irq_section,
 	.delaysection	= ref_lock_irq_delay_section,
 	.name		= "lock-irq"
@@ -490,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int
 	preempt_enable();
 }
 
-static struct ref_scale_ops acqrel_ops = {
+static const struct ref_scale_ops acqrel_ops = {
 	.readsection	= ref_acqrel_section,
 	.delaysection	= ref_acqrel_delay_section,
 	.name		= "acqrel"
@@ -524,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n
 	stopopts = x;
 }
 
-static struct ref_scale_ops clock_ops = {
+static const struct ref_scale_ops clock_ops = {
 	.readsection	= ref_clock_section,
 	.delaysection	= ref_clock_delay_section,
 	.name		= "clock"
@@ -556,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int
 	stopopts = x;
 }
 
-static struct ref_scale_ops jiffies_ops = {
+static const struct ref_scale_ops jiffies_ops = {
 	.readsection	= ref_jiffies_section,
 	.delaysection	= ref_jiffies_delay_section,
 	.name		= "jiffies"
@@ -706,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in)
 	preempt_enable();
 }
 
-static struct ref_scale_ops typesafe_ref_ops;
-static struct ref_scale_ops typesafe_lock_ops;
-static struct ref_scale_ops typesafe_seqlock_ops;
+static const struct ref_scale_ops typesafe_ref_ops;
+static const struct ref_scale_ops typesafe_lock_ops;
+static const struct ref_scale_ops typesafe_seqlock_ops;
 
 // Initialize for a typesafe test.
 static bool typesafe_init(void)
@@ -769,7 +769,7 @@ static void typesafe_cleanup(void)
 }
 
 // The typesafe_init() function distinguishes these structures by address.
-static struct ref_scale_ops typesafe_ref_ops = {
+static const struct ref_scale_ops typesafe_ref_ops = {
 	.init		= typesafe_init,
 	.cleanup	= typesafe_cleanup,
 	.readsection	= typesafe_read_section,
@@ -777,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = {
 	.name		= "typesafe_ref"
 };
 
-static struct ref_scale_ops typesafe_lock_ops = {
+static const struct ref_scale_ops typesafe_lock_ops = {
 	.init		= typesafe_init,
 	.cleanup	= typesafe_cleanup,
 	.readsection	= typesafe_read_section,
@@ -785,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = {
 	.name		= "typesafe_lock"
 };
 
-static struct ref_scale_ops typesafe_seqlock_ops = {
+static const struct ref_scale_ops typesafe_seqlock_ops = {
 	.init		= typesafe_init,
 	.cleanup	= typesafe_cleanup,
 	.readsection	= typesafe_read_section,
@@ -1026,7 +1026,7 @@ end:
 }
 
 static void
-ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" SCALE_FLAG
 		 "--- %s:  verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
@@ -1081,7 +1081,7 @@ ref_scale_init(void)
 {
 	long i;
 	int firsterr = 0;
-	static struct ref_scale_ops *scale_ops[] = {
+	static const struct ref_scale_ops *scale_ops[] = {
 		&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
 		&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
 		&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,

From 1fcb932c8b5ce86219d7dedcd63659351a43291c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 4 Jul 2024 00:56:40 +0200
Subject: [PATCH 60/64] rcu/nocb: Simplify (de-)offloading state machine

Now that the (de-)offloading process can only apply to offline CPUs,
there is no more concurrency between rcu_core and nocb kthreads. Also
the mutation now happens on empty queues.

Therefore the state machine can be reduced to a single bit called
SEGCBLIST_OFFLOADED. Simplify the transition as follows:

* Upon offloading: queue the rdp to be added to the rcuog list and
  wait for the rcuog kthread to set the SEGCBLIST_OFFLOADED bit. Unpark
  rcuo kthread.

* Upon de-offloading: Park rcuo kthread. Queue the rdp to be removed
  from the rcuog list and wait for the rcuog kthread to clear the
  SEGCBLIST_OFFLOADED bit.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h |   4 +-
 kernel/rcu/rcu_segcblist.c    |  11 ---
 kernel/rcu/rcu_segcblist.h    |   2 +-
 kernel/rcu/tree_nocb.h        | 138 ++++++++++++++++------------------
 4 files changed, 68 insertions(+), 87 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 1ef1bb54853d..2fdc2208f1ca 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -185,9 +185,7 @@ struct rcu_cblist {
  *  ----------------------------------------------------------------------------
  */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_LOCKING	BIT(1)
-#define SEGCBLIST_KTHREAD_GP	BIT(2)
-#define SEGCBLIST_OFFLOADED	BIT(3)
+#define SEGCBLIST_OFFLOADED	BIT(1)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 1693ea22ef1b..298a2c573f02 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -260,17 +260,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
 	rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
 }
 
-/*
- * Mark the specified rcu_segcblist structure as offloaded (or not)
- */
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
-{
-	if (offload)
-		rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
-	else
-		rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
-}
-
 /*
  * Does the specified rcu_segcblist structure contain callbacks that
  * are ready to be invoked?
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 7a0962dfee86..259904075636 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -89,7 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
 static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
 {
 	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
-	    rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
+	    rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED))
 		return true;
 
 	return false;
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 24daf606de0c..857024ee0349 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -604,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
 	}
 }
 
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp)
+static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp)
 {
 	struct rcu_segcblist *cblist = &rdp->cblist;
 	unsigned long flags;
-	int ret;
 
-	rcu_nocb_lock_irqsave(rdp, flags);
-	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
-	    !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+	/*
+	 * Locking orders future de-offloaded callbacks enqueue against previous
+	 * handling of this rdp. Ie: Make sure rcuog is done with this rdp before
+	 * deoffloaded callbacks can be enqueued.
+	 */
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 		/*
 		 * Offloading. Set our flag and notify the offload worker.
 		 * We will handle this rdp until it ever gets de-offloaded.
 		 */
-		rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
-		ret = 1;
-	} else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
-		   rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+		list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+		rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED);
+	} else {
 		/*
 		 * De-offloading. Clear our flag and notify the de-offload worker.
 		 * We will ignore this rdp until it ever gets re-offloaded.
 		 */
-		rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
-		ret = 0;
-	} else {
-		WARN_ON_ONCE(1);
-		ret = -1;
+		list_del(&rdp->nocb_entry_rdp);
+		rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED);
 	}
-
-	rcu_nocb_unlock_irqrestore(rdp, flags);
-
-	return ret;
+	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 }
 
 static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
@@ -841,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 	}
 
 	if (rdp_toggling) {
-		int ret;
-
-		ret = nocb_gp_toggle_rdp(rdp_toggling);
-		if (ret == 1)
-			list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
-		else if (ret == 0)
-			list_del(&rdp_toggling->nocb_entry_rdp);
-
+		nocb_gp_toggle_rdp(my_rdp, rdp_toggling);
 		swake_up_one(&rdp_toggling->nocb_state_wq);
 	}
 
@@ -1018,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void)
 }
 EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
 
-static int rdp_offload_toggle(struct rcu_data *rdp,
-			       bool offload, unsigned long flags)
-	__releases(rdp->nocb_lock)
+static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp)
 {
-	struct rcu_segcblist *cblist = &rdp->cblist;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 	bool wake_gp = false;
-
-	rcu_segcblist_offload(cblist, offload);
-	rcu_nocb_unlock_irqrestore(rdp, flags);
+	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 	// Queue this rdp for add/del to/from the list to iterate on rcuog
@@ -1041,9 +1025,25 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
 	return wake_gp;
 }
 
+static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	bool ret;
+
+	/*
+	 * Locking makes sure rcuog is done handling this rdp before deoffloaded
+	 * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable
+	 * while the ->nocb_lock is held.
+	 */
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+	return ret;
+}
+
 static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 {
-	struct rcu_segcblist *cblist = &rdp->cblist;
 	unsigned long flags;
 	int wake_gp;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
@@ -1056,51 +1056,42 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 	/* Flush all callbacks from segcblist and bypass */
 	rcu_barrier();
 
+	/*
+	 * Make sure the rcuoc kthread isn't in the middle of a nocb locked
+	 * sequence while offloading is deactivated, along with nocb locking.
+	 */
+	if (rdp->nocb_cb_kthread)
+		kthread_park(rdp->nocb_cb_kthread);
+
 	rcu_nocb_lock_irqsave(rdp, flags);
 	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+	rcu_nocb_unlock_irqrestore(rdp, flags);
 
-	wake_gp = rdp_offload_toggle(rdp, false, flags);
+	wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
 
 	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+
 	if (rdp_gp->nocb_gp_kthread) {
 		if (wake_gp)
 			wake_up_process(rdp_gp->nocb_gp_kthread);
 
 		swait_event_exclusive(rdp->nocb_state_wq,
-				      !rcu_segcblist_test_flags(cblist,
-								SEGCBLIST_KTHREAD_GP));
-		if (rdp->nocb_cb_kthread)
-			kthread_park(rdp->nocb_cb_kthread);
+				      rcu_nocb_rdp_deoffload_wait_cond(rdp));
 	} else {
 		/*
 		 * No kthread to clear the flags for us or remove the rdp from the nocb list
 		 * to iterate. Do it here instead. Locking doesn't look stricly necessary
 		 * but we stick to paranoia in this rare path.
 		 */
-		rcu_nocb_lock_irqsave(rdp, flags);
-		rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
-		rcu_nocb_unlock_irqrestore(rdp, flags);
+		raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+		rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 
 		list_del(&rdp->nocb_entry_rdp);
 	}
-	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 
-	/*
-	 * Lock one last time to acquire latest callback updates from kthreads
-	 * so we can later handle callbacks locally without locking.
-	 */
-	rcu_nocb_lock_irqsave(rdp, flags);
-	/*
-	 * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
-	 * lock is released but how about being paranoid for once?
-	 */
-	rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
-	/*
-	 * Without SEGCBLIST_LOCKING, we can't use
-	 * rcu_nocb_unlock_irqrestore() anymore.
-	 */
-	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 
 	return 0;
 }
@@ -1129,10 +1120,20 @@ int rcu_nocb_cpu_deoffload(int cpu)
 }
 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 
+static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	bool ret;
+
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+	return ret;
+}
+
 static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
 {
-	struct rcu_segcblist *cblist = &rdp->cblist;
-	unsigned long flags;
 	int wake_gp;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
@@ -1152,20 +1153,14 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
 	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
 
-	/*
-	 * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
-	 * is set.
-	 */
-	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
-
-	wake_gp = rdp_offload_toggle(rdp, true, flags);
+	wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
 	if (wake_gp)
 		wake_up_process(rdp_gp->nocb_gp_kthread);
 
-	kthread_unpark(rdp->nocb_cb_kthread);
-
 	swait_event_exclusive(rdp->nocb_state_wq,
-			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+			      rcu_nocb_rdp_offload_wait_cond(rdp));
+
+	kthread_unpark(rdp->nocb_cb_kthread);
 
 	return 0;
 }
@@ -1340,8 +1335,7 @@ void __init rcu_init_nohz(void)
 		rdp = per_cpu_ptr(&rcu_data, cpu);
 		if (rcu_segcblist_empty(&rdp->cblist))
 			rcu_segcblist_init(&rdp->cblist);
-		rcu_segcblist_offload(&rdp->cblist, true);
-		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
 	}
 	rcu_organize_nocb_kthreads();
 }

From 9139f93209d1ffd7f489ab19dee01b7c3a1a43d2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 14 Aug 2024 00:56:40 +0200
Subject: [PATCH 61/64] rcu/nocb: Fix RT throttling hrtimer armed from offline
 CPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a CPU is marked offline and until it reaches its final trip to
idle, rcuo has several opportunities to be woken up, either because
a callback has been queued in the meantime or because
rcutree_report_cpu_dead() has issued the final deferred NOCB wake up.

If RCU-boosting is enabled, RCU kthreads are set to SCHED_FIFO policy.
And if RT-bandwidth is enabled, the related hrtimer might be armed.
However this then happens after hrtimers have been migrated at the
CPUHP_AP_HRTIMERS_DYING stage, which is broken as reported by the
following warning:

 Call trace:
  enqueue_hrtimer+0x7c/0xf8
  hrtimer_start_range_ns+0x2b8/0x300
  enqueue_task_rt+0x298/0x3f0
  enqueue_task+0x94/0x188
  ttwu_do_activate+0xb4/0x27c
  try_to_wake_up+0x2d8/0x79c
  wake_up_process+0x18/0x28
  __wake_nocb_gp+0x80/0x1a0
  do_nocb_deferred_wakeup_common+0x3c/0xcc
  rcu_report_dead+0x68/0x1ac
  cpuhp_report_idle_dead+0x48/0x9c
  do_idle+0x288/0x294
  cpu_startup_entry+0x34/0x3c
  secondary_start_kernel+0x138/0x158

Fix this with waking up rcuo using an IPI if necessary. Since the
existing API to deal with this situation only handles swait queue, rcuo
is only woken up from offline CPUs if it's not already waiting on a
grace period. In the worst case some callbacks will just wait for a
grace period to complete before being assigned to a subsequent one.

Reported-by: "Cheng-Jui Wang (王正睿)" <Cheng-Jui.Wang@mediatek.com>
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 857024ee0349..e3cf354f7a7f 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -216,7 +216,10 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 	if (needwake) {
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
-		wake_up_process(rdp_gp->nocb_gp_kthread);
+		if (cpu_is_offline(raw_smp_processor_id()))
+			swake_up_one_online(&rdp_gp->nocb_gp_wq);
+		else
+			wake_up_process(rdp_gp->nocb_gp_kthread);
 	}
 
 	return needwake;

From 1b022b8763fde84c51ca06218306d7d88fb090cb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 14 Aug 2024 00:56:41 +0200
Subject: [PATCH 62/64] rcu/nocb: Conditionally wake up rcuo if not already
 waiting on GP

A callback enqueuer currently wakes up the rcuo kthread if it is adding
the first non-done callback of a CPU, whether the kthread is waiting on
a grace period or not (unless the CPU is offline).

This looks like a desired behaviour because then the rcuo kthread
doesn't wait for the end of the current grace period to handle the
callback. It is accelerated right away and assigned to the next grace
period. The GP kthread is notified about that fact and iterates with
the upcoming GP without sleeping in-between.

However this best-case scenario is contradicted by a few details,
depending on the situation:

1) If the callback is a non-bypass one queued with IRQs enabled, the
   wake up only occurs if no other pending callbacks are on the list.
   Therefore the theoretical "optimization" actually applies on rare
   occasions.

2) If the callback is a non-bypass one queued with IRQs disabled, the
   situation is similar with even more uncertainty due to the deferred
   wake up.

3) If the callback is lazy, a few jiffies don't make any difference.

4) If the callback is bypass, the wake up timer is programmed 2 jiffies
   ahead by rcuo in case the regular pending queue has been handled
   in the meantime. The rare storm of callbacks can otherwise wait for
   the currently elapsing grace period to be flushed and handled.

For all those reasons, the optimization is only theoretical and
occasional. Therefore it is reasonable that callbacks enqueuers only
wake up the rcuo kthread when it is not already waiting on a grace
period to complete.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e3cf354f7a7f..14d183438423 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -216,10 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 	if (needwake) {
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
-		if (cpu_is_offline(raw_smp_processor_id()))
-			swake_up_one_online(&rdp_gp->nocb_gp_wq);
-		else
-			wake_up_process(rdp_gp->nocb_gp_kthread);
+		swake_up_one_online(&rdp_gp->nocb_gp_wq);
 	}
 
 	return needwake;

From 7562eed272b49e233c430524e684b957f34f2fd2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 14 Aug 2024 00:56:42 +0200
Subject: [PATCH 63/64] rcu/nocb: Remove superfluous memory barrier after
 bypass enqueue

Pre-GP accesses performed by the update side must be ordered against
post-GP accesses performed by the readers. This is ensured by the
bypass or nocb locking on enqueue time, followed by the fully ordered
rnp locking initiated while callbacks are accelerated, and then
propagated throughout the whole GP lifecyle associated with the
callbacks.

Therefore the explicit barrier advertizing ordering between bypass
enqueue and rcuo wakeup is superfluous. If anything, it would even only
order the first bypass callback enqueue against the rcuo wakeup and
ignore all the subsequent ones.

Remove the needless barrier.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_nocb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 14d183438423..4ffb8c90d695 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -493,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
 	}
 	rcu_nocb_bypass_unlock(rdp);
-	smp_mb(); /* Order enqueue before wake. */
+
 	// A wake up of the grace period kthread or timer adjustment
 	// needs to be done only if:
 	// 1. Bypass list was fully empty before (this is the first

From 1ecd9d68eb44e4b7972aee2840eb4fdf29b9de2b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 23 Aug 2024 14:15:12 -0700
Subject: [PATCH 64/64] rcu: Defer printing stall-warning backtrace when
 holding rcu_node lock

The rcu_dump_cpu_stacks() holds the leaf rcu_node structure's ->lock
when dumping the stakcks of any CPUs stalling the current grace period.
This lock is held to prevent confusion that would otherwise occur when
the stalled CPU reported its quiescent state (and then went on to do
unrelated things) just as the backtrace NMI was heading towards it.

This has worked well, but on larger systems has recently been observed
to cause severe lock contention resulting in CSD-lock stalls and other
general unhappiness.

This commit therefore does printk_deferred_enter() before acquiring
the lock and printk_deferred_exit() after releasing it, thus deferring
the overhead of actually outputting the stack trace out of that lock's
critical section.

Reported-by: Rik van Riel <riel@surriel.com>
Suggested-by: Rik van Riel <riel@surriel.com>
Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/rcu/tree_stall.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b497d4c6dabd..9772e1ffcf6e 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -371,6 +371,7 @@ static void rcu_dump_cpu_stacks(void)
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rnp) {
+		printk_deferred_enter();
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		for_each_leaf_node_possible_cpu(rnp, cpu)
 			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
@@ -380,6 +381,7 @@ static void rcu_dump_cpu_stacks(void)
 					dump_cpu_task(cpu);
 			}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+		printk_deferred_exit();
 	}
 }