From 85d68222ddc5f4522e456d97d201166acb50f716 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Oct 2023 09:53:08 +0100 Subject: [PATCH 1/4] rcu: Break rcu_node_0 --> &rq->__lock order Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") added a kfree() call to free any user provided affinity mask, if present. It was changed later to use kfree_rcu() in commit 9a5418bc48ba ("sched/core: Use kfree_rcu() in do_set_cpus_allowed()") to avoid a circular locking dependency problem. It turns out that even kfree_rcu() isn't safe for avoiding circular locking problem. As reported by kernel test robot, the following circular locking dependency now exists: &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock Solve this by breaking the rcu_node_0 --> &rq->__lock chain by moving the resched_cpu() out from under rcu_node lock. [peterz: heavily borrowed from Waiman's Changelog] [paulmck: applied Z qiang feedback] Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()") Reported-by: kernel test robot Acked-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 700524726079..77e7a0dc722a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -755,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) } /* - * Return true if the specified CPU has passed through a quiescent - * state by virtue of being in or having passed through an dynticks - * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU, or by virtue of having been offline. + * Returns positive if the specified CPU has passed through a quiescent state + * by virtue of being in or having passed through an dynticks idle state since + * the last call to dyntick_save_progress_counter() for this same CPU, or by + * virtue of having been offline. + * + * Returns negative if the specified CPU needs a force resched. + * + * Returns zero otherwise. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; + int ret = 0; struct rcu_node *rnp = rdp->mynode; /* @@ -848,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || rcu_state.cbovld)) { WRITE_ONCE(rdp->rcu_urgent_qs, true); - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } /* @@ -862,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rcu_state.jiffies_resched)) { if (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq)) { - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && @@ -892,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } } - return 0; + return ret; } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ @@ -2271,15 +2276,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; - unsigned long mask; - struct rcu_data *rdp; struct rcu_node *rnp; rcu_state.cbovld = rcu_state.cbovldnext; rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { + unsigned long mask = 0; + unsigned long rsmask = 0; + cond_resched_tasks_rcu_qs(); - mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { @@ -2297,11 +2302,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) continue; } for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + struct rcu_data *rdp; + int ret; + rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { + ret = f(rdp); + if (ret > 0) { mask |= rdp->grpmask; rcu_disable_urgency_upon_qs(rdp); } + if (ret < 0) + rsmask |= rdp->grpmask; } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ @@ -2310,6 +2321,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + + for_each_leaf_node_cpu_mask(rnp, cpu, rsmask) + resched_cpu(cpu); } } From 2be4686d866ad5896f2bb94d82fe892197aea9c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2023 16:40:47 +0200 Subject: [PATCH 2/4] rcu: Introduce rcu_cpu_online() Export the RCU point of view as to when a CPU is considered offline (ie: when does RCU consider that a CPU is sufficiently down in the hotplug process to not feature any possible read side). This will be used by RCU-tasks whose vision of an offline CPU should reasonably match the one of RCU core. Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/tree.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d866eaa4cc8..b531c33e9545 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -500,6 +500,7 @@ static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } static inline void rcu_async_hurry(void) { } static inline void rcu_async_relax(void) { } +static inline bool rcu_cpu_online(int cpu) { return true; } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -509,6 +510,7 @@ void rcu_unexpedite_gp(void); void rcu_async_hurry(void); void rcu_async_relax(void); void rcupdate_announce_bootup_oddness(void); +bool rcu_cpu_online(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 77e7a0dc722a..c3359f4c8830 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4216,6 +4216,13 @@ static bool rcu_rdp_cpu_online(struct rcu_data *rdp) return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); } +bool rcu_cpu_online(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_rdp_cpu_online(rdp); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* From 9715ed501b585d47444865071674c961c0cc0020 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2023 16:40:48 +0200 Subject: [PATCH 3/4] rcu/tasks: Handle new PF_IDLE semantics The commit: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") has changed the semantics of what is to be considered an idle task in such a way that CPU boot code preceding the actual idle loop is excluded from it. This has however introduced new potential RCU-tasks stalls when either: 1) Grace period is started before init/0 had a chance to set PF_IDLE, keeping it stuck in the holdout list until idle ever schedules. 2) Grace period is started when some possible CPUs have never been online, keeping their idle tasks stuck in the holdout list until the CPU ever boots up. 3) Similar to 1) but with secondary CPUs: Grace period is started concurrently with secondary CPU booting, putting its idle task in the holdout list because PF_IDLE isn't yet observed on it. It stays then stuck in the holdout list until that CPU ever schedules. The effect is mitigated here by the hotplug AP thread that must run to bring the CPU up. Fix this with handling the new semantics of PF_IDLE, keeping in mind that it may or may not be set on an idle task. Take advantage of that to strengthen the coverage of an RCU-tasks quiescent state within an idle task, excluding the CPU boot code from it. Only the code running within the idle loop is now a quiescent state, along with offline CPUs. Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") Suggested-by: Joel Fernandes Suggested-by: Paul E . McKenney" Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1fa631168594..1087f63b75b9 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -895,10 +895,36 @@ static void rcu_tasks_pregp_step(struct list_head *hop) synchronize_rcu(); } +/* Check for quiescent states since the pregp's synchronize_rcu() */ +static bool rcu_tasks_is_holdout(struct task_struct *t) +{ + int cpu; + + /* Has the task been seen voluntarily sleeping? */ + if (!READ_ONCE(t->on_rq)) + return false; + + /* + * Idle tasks (or idle injection) within the idle loop are RCU-tasks + * quiescent states. But CPU boot code performed by the idle task + * isn't a quiescent state. + */ + if (is_idle_task(t)) + return false; + + cpu = task_cpu(t); + + /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */ + if (t == idle_task(cpu) && !rcu_cpu_online(cpu)) + return false; + + return true; +} + /* Per-task initial processing. */ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) { - if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + if (t != current && rcu_tasks_is_holdout(t)) { get_task_struct(t); t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); WRITE_ONCE(t->rcu_tasks_holdout, true); @@ -947,7 +973,7 @@ static void check_holdout_task(struct task_struct *t, if (!READ_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || + !rcu_tasks_is_holdout(t) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { WRITE_ONCE(t->rcu_tasks_holdout, false); From a80712b9cc7e57830260ec5e1feb9cdb59e1da2f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2023 16:40:49 +0200 Subject: [PATCH 4/4] rcu/tasks-trace: Handle new PF_IDLE semantics The commit: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") has changed the semantics of what is to be considered an idle task in such a way that the idle task of an offline CPU may not carry the PF_IDLE flag anymore. However RCU-tasks-trace tests the opposite assertion, still assuming that idle tasks carry the PF_IDLE flag during their whole lifecycle. Remove this assumption to avoid spurious warnings but keep the initial test verifying that the idle task is the current task on any offline CPU. Reported-by: Naresh Kamboju Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup") Suggested-by: Joel Fernandes Suggested-by: Paul E . McKenney" Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1087f63b75b9..f54d5782eca0 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1551,7 +1551,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; - WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t)))); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) n_heavy_reader_ofl_updates++; }