Merge branches 'doc.2022.06.21a', 'fixes.2022.07.19a', 'nocb.2022.07.19a', 'poll.2022.07.21a', 'rcu-tasks.2022.06.21a' and 'torture.2022.06.21a' into HEAD

doc.2022.06.21a: Documentation updates.
fixes.2022.07.19a: Miscellaneous fixes.
nocb.2022.07.19a: Callback-offload updates.
poll.2022.07.21a: Polled grace-period updates.
rcu-tasks.2022.06.21a: Tasks RCU updates.
torture.2022.06.21a: Torture-test updates.
Paul E. McKenney 2022-07-21 17:43:16 -07:00
28 changed files with 1223 additions and 484 deletions

View File

@ -3659,6 +3659,9 @@
just as if they had also been called out in the
rcu_nocbs= boot parameter.
Note that this argument takes precedence over
the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
noiotrap [SH] Disables trapped I/O port accesses.
noirqdebug [X86-32] Disables the code which attempts to detect and
@ -4557,6 +4560,9 @@
no-callback mode from boot but the mode may be
toggled at runtime via cpusets.
Note that this argument takes precedence over
the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
rcu_nocb_poll [KNL]
Rather than requiring that offloaded CPUs
(specified by rcu_nocbs= above) explicitly
@ -5799,6 +5805,24 @@
expediting. Set to zero to disable automatic
expediting.
srcutree.srcu_max_nodelay [KNL]
Specifies the number of no-delay instances
per jiffy for which the SRCU grace period
worker thread will be rescheduled with zero
delay. Beyond this limit, the worker thread will
be rescheduled with a sleep delay of one jiffy.
srcutree.srcu_max_nodelay_phase [KNL]
Specifies the number of non-sleeping polls of
readers per grace-period phase. Beyond this limit,
the grace-period worker thread will be rescheduled
with a sleep delay of one jiffy between each
rescan of the readers for that grace-period phase.
srcutree.srcu_retry_check_delay [KNL]
Specifies the number of microseconds of non-sleeping
delay between each non-sleeping poll of readers.
srcutree.small_contention_lim [KNL]
Specifies the number of update-side contention
events per jiffy that will be tolerated before

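These srcutree module parameters are read-only at runtime (note the 0444 module_param() permissions later in this commit), so they take effect only when supplied on the kernel command line. A purely illustrative example, with values chosen for demonstration rather than as recommendations:

    srcutree.srcu_retry_check_delay=10 srcutree.srcu_max_nodelay_phase=64 srcutree.srcu_max_nodelay=1000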
View File

@ -41,6 +41,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);
unsigned long get_completed_synchronize_rcu(void);
#ifdef CONFIG_PREEMPT_RCU
@ -169,13 +170,24 @@ void synchronize_rcu_tasks(void);
# endif
# ifdef CONFIG_TASKS_TRACE_RCU
# define rcu_tasks_trace_qs(t) \
do { \
if (!likely(READ_ONCE((t)->trc_reader_checked)) && \
!unlikely(READ_ONCE((t)->trc_reader_nesting))) { \
smp_store_release(&(t)->trc_reader_checked, true); \
smp_mb(); /* Readers partitioned by store. */ \
} \
// Bits for ->trc_reader_special.b.need_qs field.
#define TRC_NEED_QS 0x1 // Task needs a quiescent state.
#define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state.
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
void rcu_tasks_trace_qs_blkd(struct task_struct *t);
# define rcu_tasks_trace_qs(t) \
do { \
int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \
\
if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \
likely(!___rttq_nesting)) { \
rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \
} else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \
!READ_ONCE((t)->trc_reader_special.b.blocked)) { \
rcu_tasks_trace_qs_blkd(t); \
} \
} while (0)
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
@ -184,7 +196,7 @@ void synchronize_rcu_tasks(void);
#define rcu_tasks_qs(t, preempt) \
do { \
rcu_tasks_classic_qs((t), (preempt)); \
rcu_tasks_trace_qs((t)); \
rcu_tasks_trace_qs(t); \
} while (0)
# ifdef CONFIG_TASKS_RUDE_RCU

View File

@ -75,7 +75,7 @@ static inline void rcu_read_unlock_trace(void)
nesting = READ_ONCE(t->trc_reader_nesting) - 1;
barrier(); // Critical section before disabling.
// Disable IPI-based setting of .need_qs.
WRITE_ONCE(t->trc_reader_nesting, INT_MIN);
WRITE_ONCE(t->trc_reader_nesting, INT_MIN + nesting);
if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) {
WRITE_ONCE(t->trc_reader_nesting, nesting);
return; // We assume shallow reader nesting.

View File

@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
might_sleep();
}
static inline unsigned long start_poll_synchronize_rcu_expedited(void)
{
return start_poll_synchronize_rcu();
}
static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
{
cond_synchronize_rcu(oldstate);
}
extern void rcu_barrier(void);
static inline void synchronize_rcu_expedited(void)
@ -38,7 +48,7 @@ static inline void synchronize_rcu_expedited(void)
*/
extern void kvfree(const void *addr);
static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
if (head) {
call_rcu(head, func);
@ -51,6 +61,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
kvfree((void *) func);
}
#ifdef CONFIG_KASAN_GENERIC
void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
#else
static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__kvfree_call_rcu(head, func);
}
#endif
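Callers normally reach this Tiny RCU path through the kfree_rcu()/kvfree_rcu() macros rather than by invoking kvfree_call_rcu() directly. A minimal sketch of typical usage, with a made-up structure and field name for illustration:

    struct foo {
            int data;
            struct rcu_head rh;
    };

    static void release_foo(struct foo *p)
    {
            /* p must have come from kmalloc()/kzalloc(); freed after a grace period. */
            kfree_rcu(p, rh);
    }

The CONFIG_KASAN_GENERIC split above provides an out-of-line kvfree_call_rcu() for KASAN builds (giving the implementation a chance to record the caller before handing off to __kvfree_call_rcu()), while non-KASAN builds keep the inline wrapper.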
void rcu_qs(void);
static inline void rcu_softirq_qs(void)

View File

@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu);
void rcu_momentary_dyntick_idle(void);
void kfree_rcu_scheduler_running(void);
bool rcu_gp_might_be_stalled(void);
unsigned long start_poll_synchronize_rcu_expedited(void);
void cond_synchronize_rcu_expedited(unsigned long oldstate);
unsigned long get_state_synchronize_rcu(void);
unsigned long start_poll_synchronize_rcu(void);
bool poll_state_synchronize_rcu(unsigned long oldstate);
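The expedited polled grace-period functions declared here follow the same usage pattern as the existing non-expedited ones. A minimal sketch, where do_update() stands in for hypothetical update-side work:

    unsigned long cookie;

    cookie = start_poll_synchronize_rcu_expedited();  /* snapshot and start expedited GP */
    do_update();                                      /* hypothetical update-side work */

    if (poll_state_synchronize_rcu(cookie))           /* non-blocking check */
            pr_info("grace period already elapsed\n");

    cond_synchronize_rcu_expedited(cookie);           /* wait only if still needed */

As the rcutorture hooks later in this commit show, cookies from the expedited API are polled with the ordinary poll_state_synchronize_rcu().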

View File

@ -843,8 +843,9 @@ struct task_struct {
int trc_reader_nesting;
int trc_ipi_to_cpu;
union rcu_special trc_reader_special;
bool trc_reader_checked;
struct list_head trc_holdout_list;
struct list_head trc_blkd_node;
int trc_blkd_cpu;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
struct sched_info sched_info;
@ -2223,6 +2224,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
/*
* In order to reduce various lock holder preemption latencies provide an

View File

@ -157,6 +157,7 @@ struct task_struct init_task
.trc_reader_nesting = 0,
.trc_reader_special.s = 0,
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,

View File

@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

View File

@ -262,6 +262,35 @@ config RCU_NOCB_CPU
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
config RCU_NOCB_CPU_DEFAULT_ALL
bool "Offload RCU callback processing from all CPUs by default"
depends on RCU_NOCB_CPU
default n
help
Use this option to offload callback processing from all CPUs
by default, in the absence of the rcu_nocbs or nohz_full boot
parameter. This also avoids the need to use any boot parameters
to achieve the effect of offloading all CPUs on boot.
Say Y here if you want to offload all CPUs by default on boot.
Say N here if you are unsure.
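A sketch of the .config fragment this option implies for a kernel that offloads every CPU without any boot parameters (illustrative only):

    CONFIG_RCU_NOCB_CPU=y
    CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y

As the kernel-parameters hunks earlier in this commit note, an explicit rcu_nocbs= on the command line still takes precedence over this default.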
config RCU_NOCB_CPU_CB_BOOST
bool "Offload RCU callback from real-time kthread"
depends on RCU_NOCB_CPU && RCU_BOOST
default y if PREEMPT_RT
help
Use this option to invoke offloaded callbacks as SCHED_FIFO
to avoid starvation by heavy SCHED_OTHER background load.
Of course, running as SCHED_FIFO during callback floods will
cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
of milliseconds or more. Therefore, when enabling this option,
it is your responsibility to ensure that latency-sensitive
tasks either run with higher priority or run on some other CPU.
Say Y here if you want to set RT priority for offloading kthreads.
Say N here if you are building a !PREEMPT_RT kernel and are unsure.
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
depends on RCU_EXPERT && TASKS_TRACE_RCU

View File

@ -121,7 +121,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
default n
select PREEMPT_COUNT if PREEMPT=n
help

View File

@ -23,6 +23,9 @@
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
/* Low-order bit definition for polled grace-period APIs. */
#define RCU_GET_STATE_COMPLETED 0x1
extern int sysctl_sched_rt_runtime;
/*
@ -119,6 +122,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
return ULONG_CMP_GE(READ_ONCE(*sp), s);
}
/*
* Given a snapshot from rcu_seq_snap(), determine whether or not a
* full update-side operation has occurred, but do not allow the
* (ULONG_MAX / 2) safety-factor/guard-band.
*/
static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
}
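A quick worked reading of the guard band, assuming the RCU_SEQ_CTR_SHIFT of 2 defined above (so RCU_SEQ_STATE_MASK is 0x3 and each grace period advances the sequence by 4):

    /* Illustrative arithmetic only, mirroring rcu_seq_done_exact() above.   */
    /* guard = 2 * RCU_SEQ_STATE_MASK + 1 = 7, just under two grace periods. */
    /* For an illustrative snapshot s = 100:                                 */
    /*   cur_s >= 100      -> true  (grace period has completed)             */
    /*   cur_s in 93..99   -> false (not yet done)                           */
    /*   cur_s "below" 93  -> true  (outside the small guard band)           */

Unlike rcu_seq_done(), which relies on the roughly ULONG_MAX / 2 safety factor, this exact variant forgives only that seven-count window.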
/*
* Has a grace period completed since the time the old gp_seq was collected?
*/

View File

@ -419,6 +419,7 @@ rcu_scale_writer(void *arg)
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
current->flags |= PF_NO_SETAFFINITY;
sched_set_fifo_low(current);
if (holdoff)

View File

@ -83,9 +83,11 @@ torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@ -192,12 +194,16 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
#define RTWS_COND_SYNC 6
#define RTWS_POLL_GET 7
#define RTWS_POLL_WAIT 8
#define RTWS_SYNC 9
#define RTWS_STUTTER 10
#define RTWS_STOPPING 11
#define RTWS_COND_GET_EXP 6
#define RTWS_COND_SYNC 7
#define RTWS_COND_SYNC_EXP 8
#define RTWS_POLL_GET 9
#define RTWS_POLL_GET_EXP 10
#define RTWS_POLL_WAIT 11
#define RTWS_POLL_WAIT_EXP 12
#define RTWS_SYNC 13
#define RTWS_STUTTER 14
#define RTWS_STOPPING 15
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@ -205,9 +211,13 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
"RTWS_COND_GET_EXP",
"RTWS_COND_SYNC",
"RTWS_COND_SYNC_EXP",
"RTWS_POLL_GET",
"RTWS_POLL_GET_EXP",
"RTWS_POLL_WAIT",
"RTWS_POLL_WAIT_EXP",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@ -320,7 +330,12 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
unsigned long (*get_gp_state_exp)(void);
unsigned long (*start_gp_poll_exp)(void);
bool (*poll_gp_state_exp)(unsigned long oldstate);
void (*cond_sync_exp)(unsigned long oldstate);
unsigned long (*get_gp_state)(void);
unsigned long (*get_gp_completed)(void);
unsigned long (*start_gp_poll)(void);
bool (*poll_gp_state)(unsigned long oldstate);
void (*cond_sync)(unsigned long oldstate);
@ -487,9 +502,14 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
.get_gp_completed = get_completed_synchronize_rcu,
.start_gp_poll = start_poll_synchronize_rcu,
.poll_gp_state = poll_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
.get_gp_state_exp = get_state_synchronize_rcu,
.start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@ -1119,9 +1139,8 @@ rcu_torture_fqs(void *arg)
return 0;
}
// Used by writers to randomly choose from the available grace-period
// primitives. The only purpose of the initialization is to size the array.
static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
// Used by writers to randomly choose from the available grace-period primitives.
static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
static int nsynctypes;
/*
@ -1129,18 +1148,27 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
bool gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
!gp_normal1 && !gp_poll1 && !gp_sync1)
gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
gp_normal1 = gp_poll1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
} else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
synctype[nsynctypes++] = RTWS_COND_GET_EXP;
pr_info("%s: Testing conditional expedited GPs.\n", __func__);
} else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
}
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@ -1159,6 +1187,12 @@ static void rcu_torture_write_types(void)
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
pr_info("%s: Testing polling expedited GPs.\n", __func__);
} else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
}
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@ -1237,6 +1271,10 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
if (cur_ops->get_gp_completed) {
cookie = cur_ops->get_gp_completed();
WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
}
cur_ops->readunlock(idx);
}
switch (synctype[torture_random(&rand) % nsynctypes]) {
@ -1246,7 +1284,12 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
cookie = cur_ops->get_gp_state();
cur_ops->exp_sync();
cur_ops->exp_sync();
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@ -1257,6 +1300,14 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET_EXP:
rcu_torture_writer_state = RTWS_COND_GET_EXP;
gp_snap = cur_ops->get_gp_state_exp();
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
cur_ops->cond_sync_exp(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
gp_snap = cur_ops->start_gp_poll();
@ -1266,9 +1317,23 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET_EXP:
rcu_torture_writer_state = RTWS_POLL_GET_EXP;
gp_snap = cur_ops->start_gp_poll_exp();
rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
while (!cur_ops->poll_gp_state_exp(gp_snap))
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
cookie = cur_ops->get_gp_state();
cur_ops->sync();
cur_ops->sync();
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
default:
@ -1304,8 +1369,9 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
rcu_ftrace_dump(DUMP_ALL);
tracing_off();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@ -1367,6 +1433,11 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync(gp_snap);
break;
case RTWS_COND_GET_EXP:
gp_snap = cur_ops->get_gp_state_exp();
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync_exp(gp_snap);
break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@ -1374,6 +1445,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
case RTWS_POLL_GET_EXP:
gp_snap = cur_ops->start_gp_poll_exp();
while (!cur_ops->poll_gp_state_exp(gp_snap)) {
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
}
break;
case RTWS_SYNC:
cur_ops->sync();
break;
@ -1851,7 +1929,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@ -1973,7 +2051,13 @@ static void rcu_torture_mem_dump_obj(void)
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
if (WARN_ON_ONCE(!kcp))
return;
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
if (WARN_ON_ONCE(!rhp)) {
kmem_cache_destroy(kcp);
return;
}
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
@ -1990,6 +2074,8 @@ static void rcu_torture_mem_dump_obj(void)
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (WARN_ON_ONCE(!rhp))
return;
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
@ -1997,6 +2083,8 @@ static void rcu_torture_mem_dump_obj(void)
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
if (WARN_ON_ONCE(!rhp))
return;
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
@ -2058,6 +2146,19 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
// Testing RCU priority boosting requires rcutorture do
// some serious abuse. Counter this by running ksoftirqd
// at higher priority.
if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
struct sched_param sp;
struct task_struct *t;
t = per_cpu(ksoftirqd, cpu);
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
@ -2856,7 +2957,6 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Parent kthread which creates and destroys read-exit child kthreads.
static int rcu_torture_read_exit(void *unused)
{
int count = 0;
bool errexit = false;
int i;
struct task_struct *tsp;
@ -2868,34 +2968,28 @@ static int rcu_torture_read_exit(void *unused)
// Each pass through this loop does one read-exit episode.
do {
if (++count > read_exit_burst) {
VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
rcu_barrier(); // Wait for task_struct free, avoid OOM.
for (i = 0; i < read_exit_delay; i++) {
schedule_timeout_uninterruptible(HZ);
if (READ_ONCE(read_exit_child_stop))
break;
VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
for (i = 0; i < read_exit_burst; i++) {
if (READ_ONCE(read_exit_child_stop))
break;
stutter_wait("rcu_torture_read_exit");
// Spawn child.
tsp = kthread_run(rcu_torture_read_exit_child,
&trs, "%s", "rcu_torture_read_exit_child");
if (IS_ERR(tsp)) {
TOROUT_ERRSTRING("out of memory");
errexit = true;
break;
}
if (!READ_ONCE(read_exit_child_stop))
VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
count = 0;
cond_resched();
kthread_stop(tsp);
n_read_exits++;
}
if (READ_ONCE(read_exit_child_stop))
break;
// Spawn child.
tsp = kthread_run(rcu_torture_read_exit_child,
&trs, "%s",
"rcu_torture_read_exit_child");
if (IS_ERR(tsp)) {
TOROUT_ERRSTRING("out of memory");
errexit = true;
tsp = NULL;
break;
}
cond_resched();
kthread_stop(tsp);
n_read_exits ++;
stutter_wait("rcu_torture_read_exit");
VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
rcu_barrier(); // Wait for task_struct free, avoid OOM.
i = 0;
for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
schedule_timeout_uninterruptible(HZ);
} while (!errexit && !READ_ONCE(read_exit_child_stop));
// Clean up and exit.
@ -3105,6 +3199,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@ -3312,21 +3407,6 @@ rcu_torture_init(void)
rcutor_hp = firsterr;
if (torture_init_error(firsterr))
goto unwind;
// Testing RCU priority boosting requires rcutorture do
// some serious abuse. Counter this by running ksoftirqd
// at higher priority.
if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
for_each_online_cpu(cpu) {
struct sched_param sp;
struct task_struct *t;
t = per_cpu(ksoftirqd, cpu);
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
}
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);

View File

@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = {
};
// Definitions for global spinlock
static DEFINE_SPINLOCK(test_lock);
static DEFINE_RAW_SPINLOCK(test_lock);
static void ref_lock_section(const int nloops)
{
@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock(&test_lock);
spin_unlock(&test_lock);
raw_spin_lock(&test_lock);
raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock(&test_lock);
raw_spin_lock(&test_lock);
un_delay(udl, ndl);
spin_unlock(&test_lock);
raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock_irqsave(&test_lock, flags);
spin_unlock_irqrestore(&test_lock, flags);
raw_spin_lock_irqsave(&test_lock, flags);
raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_disable();
for (i = nloops; i >= 0; i--) {
spin_lock_irqsave(&test_lock, flags);
raw_spin_lock_irqsave(&test_lock, flags);
un_delay(udl, ndl);
spin_unlock_irqrestore(&test_lock, flags);
raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}

View File

@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}
#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
/*
* We use an adaptive strategy for synchronize_srcu() and especially for
* synchronize_srcu_expedited(). We spin for a fixed time period
* (defined below, boot time configurable) to allow SRCU readers to exit
* their read-side critical sections. If there are still some readers
* after one jiffy, we repeatedly block for one jiffy time periods.
* The blocking time is increased as the grace-period age increases,
* with max blocking time capped at 10 jiffies.
*/
#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
module_param(srcu_retry_check_delay, ulong, 0444);
#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
// no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
// no-delay instances.
#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
// per-GP-phase no-delay instances adjusted to allow non-sleeping polling for up to
// one jiffy's worth of time. The multiplication by 2 factors in the srcu_get_delay()
// call made from process_srcu().
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
// Maximum per-GP-phase consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
module_param(srcu_max_nodelay_phase, ulong, 0444);
// Maximum consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
module_param(srcu_max_nodelay, ulong, 0444);
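To make the defaults concrete, a worked example of the arithmetic above (values follow directly from the macros; USEC_PER_SEC is 1000000):

    /* SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED = 2 * USEC_PER_SEC / HZ / 5       */
    /*   HZ = 1000: 2000000 / 1000 / 5 =  400 -> within [3, 1000]    ->  400     */
    /*   HZ =  250: 2000000 /  250 / 5 = 1600 -> clamped to highmark -> 1000     */
    /* SRCU_DEFAULT_MAX_NODELAY = max(phase default, 100): 400 and 1000 here.    */

Both srcu_max_nodelay_phase and srcu_max_nodelay, like srcu_retry_check_delay, can be overridden at boot via the srcutree module parameters documented earlier in this commit.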
/*
* Return grace-period delay, zero if there are expedited grace
@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
unsigned long gpstart;
unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
jbase = 0;
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
if (!jbase) {
WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
jbase = 1;
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
j = jiffies - 1;
gpstart = READ_ONCE(ssp->srcu_gp_start);
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
}
}
return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
@ -606,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
* We use an adaptive strategy for synchronize_srcu() and especially for
* synchronize_srcu_expedited(). We spin for a fixed time period
* (defined below) to allow SRCU readers to exit their read-side critical
* sections. If there are still some readers after a few microseconds,
* we repeatedly block for 1-millisecond time periods.
*/
#define SRCU_RETRY_CHECK_DELAY 5
/*
* Start an SRCU grace period.
*/
@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
unsigned long cbdelay;
unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = !!srcu_get_delay(ssp);
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
cbdelay = 0;
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
curdelay = !srcu_get_delay(ssp);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
if (--trycount + !srcu_get_delay(ssp) <= 0)
if ((--trycount + curdelay) <= 0)
return false;
udelay(SRCU_RETRY_CHECK_DELAY);
udelay(srcu_retry_check_delay);
}
}
@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
j = jiffies;
if (READ_ONCE(ssp->reschedule_jiffies) == j) {
WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
WRITE_ONCE(ssp->reschedule_count, 1);
@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);

View File

@ -14,7 +14,7 @@
struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
typedef void (*pregp_func_t)(void);
typedef void (*pregp_func_t)(struct list_head *hop);
typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
typedef void (*postscan_func_t)(struct list_head *hop);
typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_work: Work queue for invoking callbacks.
* @rtp_irq_work: IRQ work queue for deferred wakeups.
* @barrier_q_head: RCU callback for barrier operation.
* @rtp_blkd_tasks: List of tasks blocked as readers.
* @cpu: CPU number corresponding to this entry.
* @rtpp: Pointer to the rcu_tasks structure.
*/
@ -40,6 +41,7 @@ struct rcu_tasks_percpu {
struct work_struct rtp_work;
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
struct list_head rtp_blkd_tasks;
int cpu;
struct rcu_tasks *rtpp;
};
@ -48,6 +50,7 @@ struct rcu_tasks_percpu {
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
* @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
@ -79,6 +82,7 @@ struct rcu_tasks_percpu {
struct rcu_tasks {
struct rcuwait cbs_wait;
raw_spinlock_t cbs_gbl_lock;
struct mutex tasks_gp_mutex;
int gp_state;
int gp_sleep;
int init_fract;
@ -119,6 +123,7 @@ static struct rcu_tasks rt_name = \
{ \
.cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
.cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
.gp_func = gp, \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
@ -140,6 +145,7 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
module_param(rcu_task_ipi_delay, int, 0644);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
@ -253,6 +259,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@ -323,17 +331,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
irq_work_queue(&rtpcp->rtp_irq_work);
}
// Wait for a grace period for the specified flavor of Tasks RCU.
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
wait_rcu_gp(rtp->call_func);
}
// RCU callback function for rcu_barrier_tasks_generic().
static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
{
@ -439,6 +436,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
@ -497,10 +499,41 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
rcu_tasks_invoke_cbs(rtp, rtpcp);
}
/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
// Wait for one grace period.
static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
{
int needgpcb;
mutex_lock(&rtp->tasks_gp_mutex);
// If there were none, wait a bit and start over.
if (unlikely(midboot)) {
needgpcb = 0x2;
} else {
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
}
if (needgpcb & 0x2) {
// Wait for one grace period.
set_tasks_gp_state(rtp, RTGS_WAIT_GP);
rtp->gp_start = jiffies;
rcu_seq_start(&rtp->tasks_gp_seq);
rtp->gp_func(rtp);
rcu_seq_end(&rtp->tasks_gp_seq);
}
// Invoke callbacks.
set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
mutex_unlock(&rtp->tasks_gp_mutex);
}
// RCU-tasks kthread that detects grace periods and invokes callbacks.
static int __noreturn rcu_tasks_kthread(void *arg)
{
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
@ -514,31 +547,30 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
// Wait for one grace period and invoke any callbacks
// that are ready.
rcu_tasks_one_gp(rtp, false);
/* If there were none, wait a bit and start over. */
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
if (needgpcb & 0x2) {
// Wait for one grace period.
set_tasks_gp_state(rtp, RTGS_WAIT_GP);
rtp->gp_start = jiffies;
rcu_seq_start(&rtp->tasks_gp_seq);
rtp->gp_func(rtp);
rcu_seq_end(&rtp->tasks_gp_seq);
}
/* Invoke callbacks. */
set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
/* Paranoid sleep to keep this from entering a tight loop */
// Paranoid sleep to keep this from entering a tight loop.
schedule_timeout_idle(rtp->gp_sleep);
}
}
// Wait for a grace period for the specified flavor of Tasks RCU.
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
wait_rcu_gp(rtp->call_func);
return;
}
rcu_tasks_one_gp(rtp, true);
}
/* Spawn RCU-tasks grace-period kthread. */
static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
{
@ -630,7 +662,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
rtp->pregp_func();
rtp->pregp_func(&holdouts);
/*
* There were callbacks, so we need to wait for an RCU-tasks
@ -639,10 +671,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* and make a list of them in holdouts.
*/
set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
rcu_read_lock();
for_each_process_thread(g, t)
rtp->pertask_func(t, &holdouts);
rcu_read_unlock();
if (rtp->pertask_func) {
rcu_read_lock();
for_each_process_thread(g, t)
rtp->pertask_func(t, &holdouts);
rcu_read_unlock();
}
set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
rtp->postscan_func(&holdouts);
@ -760,7 +794,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// disabling.
/* Pre-grace-period preparation. */
static void rcu_tasks_pregp_step(void)
static void rcu_tasks_pregp_step(struct list_head *hop)
{
/*
* Wait for all pre-existing t->on_rq and t->nvcsw transitions
@ -1105,11 +1139,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
// There are of course downsides. The grace-period code can send IPIs to
// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
// It is necessary to scan the full tasklist, much as for Tasks RCU. There
// is a single callback queue guarded by a single lock, again, much as for
// Tasks RCU. If needed, these downsides can be at least partially remedied.
// There are of course downsides. For example, the grace-period code
// can send IPIs to CPUs, even when those CPUs are in the idle loop or
// in nohz_full userspace. If needed, these downsides can be at least
// partially remedied.
//
// Perhaps most important, this variant of RCU does not affect the vanilla
// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
@ -1122,38 +1155,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// invokes these functions in this order:
//
// rcu_tasks_trace_pregp_step():
// Initialize the count of readers and block CPU-hotplug operations.
// rcu_tasks_trace_pertask(), invoked on every non-idle task:
// Initialize per-task state and attempt to identify an immediate
// quiescent state for that task, or, failing that, attempt to
// set that task's .need_qs flag so that task's next outermost
// rcu_read_unlock_trace() will report the quiescent state (in which
// case the count of readers is incremented). If both attempts fail,
// the task is added to a "holdout" list. Note that IPIs are used
// to invoke trc_read_check_handler() in the context of running tasks
// in order to avoid ordering overhead on common-case shared-variable
// accessses.
// Disables CPU hotplug, adds all currently executing tasks to the
// holdout list, then checks the state of all tasks that blocked
// or were preempted within their current RCU Tasks Trace read-side
// critical section, adding them to the holdout list if appropriate.
// Finally, this function re-enables CPU hotplug.
// The ->pertask_func() pointer is NULL, so there is no per-task processing.
// rcu_tasks_trace_postscan():
// Initialize state and attempt to identify an immediate quiescent
// state as above (but only for idle tasks), unblock CPU-hotplug
// operations, and wait for an RCU grace period to avoid races with
// tasks that are in the process of exiting.
// Invokes synchronize_rcu() to wait for late-stage exiting tasks
// to finish exiting.
// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
// corresponding task is removed from the holdout list.
// corresponding task is removed from the holdout list. Once this
// list is empty, the grace period has completed.
// rcu_tasks_trace_postgp():
// Wait for the count of readers do drop to zero, reporting any stalls.
// Also execute full memory barriers to maintain ordering with code
// executing after the grace period.
// Provides the needed full memory barrier and does debug checks.
//
// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
//
// Pre-grace-period update-side code is ordered before the grace
// period via the ->cbs_lock and barriers in rcu_tasks_kthread().
// Pre-grace-period read-side code is ordered before the grace period by
// atomic_dec_and_test() of the count of readers (for IPIed readers) and by
// scheduler context-switch ordering (for locked-down non-running readers).
// Pre-grace-period update-side code is ordered before the grace period
// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
// read-side code is ordered before the grace period by atomic operations
// on .b.need_qs flag of each task involved in this process, or by scheduler
// context-switch ordering (for locked-down non-running readers).
// The lockdep state must be outside of #ifdef to be useful.
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@ -1165,9 +1190,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@ -1176,44 +1198,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
static unsigned long n_heavy_reader_attempts;
static unsigned long n_heavy_reader_updates;
static unsigned long n_heavy_reader_ofl_updates;
static unsigned long n_trc_holdouts;
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
"RCU Tasks Trace");
/*
* This irq_work handler allows rcu_read_unlock_trace() to be invoked
* while the scheduler locks are held.
*/
static void rcu_read_unlock_iw(struct irq_work *iwp)
/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
static u8 rcu_ld_need_qs(struct task_struct *t)
{
wake_up(&trc_wait);
smp_mb(); // Enforce full grace-period ordering.
return smp_load_acquire(&t->trc_reader_special.b.need_qs);
}
static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
/* If we are the last reader, wake up the grace-period kthread. */
/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
static void rcu_st_need_qs(struct task_struct *t, u8 v)
{
smp_store_release(&t->trc_reader_special.b.need_qs, v);
smp_mb(); // Enforce full grace-period ordering.
}
/*
* Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
* the four-byte operand-size restriction of some platforms.
* Returns the old value, which is often ignored.
*/
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
union rcu_special ret;
union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
union rcu_special trs_new = trs_old;
if (trs_old.b.need_qs != old)
return trs_old.b.need_qs;
trs_new.b.need_qs = new;
ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
return ret.b.need_qs;
}
EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
/*
* If we are the last reader, signal the grace-period kthread.
* Also remove from the per-CPU list of blocked tasks.
*/
void rcu_read_unlock_trace_special(struct task_struct *t)
{
int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
union rcu_special trs;
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
t->trc_reader_special.b.need_mb)
// Open-coded full-word version of rcu_ld_need_qs().
smp_mb(); // Enforce full grace-period ordering.
trs = smp_load_acquire(&t->trc_reader_special);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
smp_mb(); // Pairs with update-side barriers.
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
if (nq)
WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
TRC_NEED_QS_CHECKED);
WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
}
if (trs.b.blocked) {
rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->trc_blkd_node);
WRITE_ONCE(t->trc_reader_special.b.blocked, false);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
WRITE_ONCE(t->trc_reader_nesting, 0);
if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
irq_work_queue(&rcu_tasks_trace_iw);
}
EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
/* Add a newly blocked reader task to its CPU's list. */
void rcu_tasks_trace_qs_blkd(struct task_struct *t)
{
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
local_irq_save(flags);
rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
t->trc_blkd_cpu = smp_processor_id();
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
WRITE_ONCE(t->trc_reader_special.b.blocked, true);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
/* Add a task to the holdout list, if it is not already on the list. */
static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
{
if (list_empty(&t->trc_holdout_list)) {
get_task_struct(t);
list_add(&t->trc_holdout_list, bhp);
n_trc_holdouts++;
}
}
@ -1223,37 +1305,36 @@ static void trc_del_holdout(struct task_struct *t)
if (!list_empty(&t->trc_holdout_list)) {
list_del_init(&t->trc_holdout_list);
put_task_struct(t);
n_trc_holdouts--;
}
}
/* IPI handler to check task state. */
static void trc_read_check_handler(void *t_in)
{
int nesting;
struct task_struct *t = current;
struct task_struct *texp = t_in;
// If the task is no longer running on this CPU, leave.
if (unlikely(texp != t)) {
if (unlikely(texp != t))
goto reset_ipi; // Already on holdout list, so will check later.
}
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
if (likely(!READ_ONCE(t->trc_reader_nesting))) {
WRITE_ONCE(t->trc_reader_checked, true);
nesting = READ_ONCE(t->trc_reader_nesting);
if (likely(!nesting)) {
rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
if (unlikely(nesting < 0))
goto reset_ipi;
WRITE_ONCE(t->trc_reader_checked, true);
// Get here if the task is in a read-side critical section. Set
// its state so that it will awaken the grace-period kthread upon
// exit from that critical section.
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
// Get here if the task is in a read-side critical section.
// Set its state so that it will update state for the grace-period
// kthread upon exit from that critical section.
rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
@ -1264,48 +1345,50 @@ reset_ipi:
}
/* Callback function for scheduler to check locked-down task. */
static int trc_inspect_reader(struct task_struct *t, void *arg)
static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
{
struct list_head *bhp = bhp_in;
int cpu = task_cpu(t);
int nesting;
bool ofl = cpu_is_offline(cpu);
if (task_curr(t)) {
WARN_ON_ONCE(ofl && !is_idle_task(t));
if (task_curr(t) && !ofl) {
// If no chance of heavyweight readers, do it the hard way.
if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
if (!ofl && // Check for "running" idle tasks on offline CPUs.
!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
// Check for "running" idle tasks on offline CPUs.
if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
if (ofl)
n_heavy_reader_ofl_updates++;
nesting = 0;
} else {
// The task is not running, so C-language access is safe.
nesting = t->trc_reader_nesting;
WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
n_heavy_reader_ofl_updates++;
}
// If not exiting a read-side critical section, mark as checked
// so that the grace-period kthread will remove it from the
// holdout list.
t->trc_reader_checked = nesting >= 0;
if (nesting <= 0)
return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later.
if (!nesting) {
rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
return 0; // In QS, so done.
}
if (nesting < 0)
return -EINVAL; // Reader transitioning, try again later.
// The task is in a read-side critical section, so set up its
// state so that it will awaken the grace-period kthread upon exit
// from that critical section.
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
// state so that it will update state upon exit from that critical
// section.
if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
trc_add_holdout(t, bhp);
return 0;
}
@ -1321,14 +1404,14 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
t->trc_reader_checked = true;
rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
// Attempt to nail down the task for inspection.
get_task_struct(t);
if (!task_call_func(t, trc_inspect_reader, NULL)) {
if (!task_call_func(t, trc_inspect_reader, bhp)) {
put_task_struct(t);
return;
}
@ -1366,56 +1449,93 @@ static void trc_wait_for_one_reader(struct task_struct *t,
}
}
/* Initialize for a new RCU-tasks-trace grace period. */
static void rcu_tasks_trace_pregp_step(void)
/*
* Initialize for first-round processing for the specified task.
* Return false if task is NULL or already taken care of, true otherwise.
*/
static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
{
int cpu;
// During early boot when there is only the one boot CPU, there
// is no idle task for the other CPUs. Also, the grace-period
// kthread is always in a quiescent state. In addition, just return
// if this task is already on the list.
if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
return false;
// Allow for fast-acting IPIs.
atomic_set(&trc_n_readers_need_end, 1);
rcu_st_need_qs(t, 0);
t->trc_ipi_to_cpu = -1;
return true;
}
/* Do first-round processing for the specified task. */
static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
{
if (rcu_tasks_trace_pertask_prep(t, true))
trc_wait_for_one_reader(t, hop);
}
/* Initialize for a new RCU-tasks-trace grace period. */
static void rcu_tasks_trace_pregp_step(struct list_head *hop)
{
LIST_HEAD(blkd_tasks);
int cpu;
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
struct task_struct *t;
// There shouldn't be any old IPIs, but...
for_each_possible_cpu(cpu)
WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
// Disable CPU hotplug across the tasklist scan.
// This also waits for all readers in CPU-hotplug code paths.
// Disable CPU hotplug across the CPU scan for the benefit of
// any IPIs that might be needed. This also waits for all readers
// in CPU-hotplug code paths.
cpus_read_lock();
}
/* Do first-round processing for the specified task. */
static void rcu_tasks_trace_pertask(struct task_struct *t,
struct list_head *hop)
{
// During early boot when there is only the one boot CPU, there
// is no idle task for the other CPUs. Just return.
if (unlikely(t == NULL))
return;
// These rcu_tasks_trace_pertask_prep() calls are serialized to
// allow safe access to the hop list.
for_each_online_cpu(cpu) {
rcu_read_lock();
t = cpu_curr_snapshot(cpu);
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
rcu_read_unlock();
}
WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
WRITE_ONCE(t->trc_reader_checked, false);
t->trc_ipi_to_cpu = -1;
trc_wait_for_one_reader(t, hop);
// Only after all running tasks have been accounted for is it
// safe to take care of the tasks that have blocked within their
// current RCU tasks trace read-side critical section.
for_each_possible_cpu(cpu) {
rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
while (!list_empty(&blkd_tasks)) {
rcu_read_lock();
t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
list_del_init(&t->trc_blkd_node);
list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
rcu_tasks_trace_pertask(t, hop);
rcu_read_unlock();
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
}
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
// Re-enable CPU hotplug now that the holdout list is populated.
cpus_read_unlock();
}
/*
* Do intermediate processing between task and holdout scans and
* pick up the idle tasks.
* Do intermediate processing between task and holdout scans.
*/
static void rcu_tasks_trace_postscan(struct list_head *hop)
{
int cpu;
for_each_possible_cpu(cpu)
rcu_tasks_trace_pertask(idle_task(cpu), hop);
// Re-enable CPU hotplug now that the tasklist scan has completed.
cpus_read_unlock();
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
synchronize_rcu();
// Any tasks that exit after this point will set ->trc_reader_checked.
// Any tasks that exit after this point will set
// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
}
/* Communicate task state back to the RCU tasks trace stall warning request. */
@ -1429,11 +1549,11 @@ static int trc_check_slow_task(struct task_struct *t, void *arg)
{
struct trc_stall_chk_rdr *trc_rdrp = arg;
if (task_curr(t))
if (task_curr(t) && cpu_online(task_cpu(t)))
return false; // It is running, so decline to inspect it.
trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
trc_rdrp->needqs = rcu_ld_need_qs(t);
return true;
}
@ -1450,18 +1570,21 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
}
cpu = task_cpu(t);
if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
pr_alert("P%d: %c\n",
pr_alert("P%d: %c%c\n",
t->pid,
".I"[t->trc_ipi_to_cpu >= 0],
".i"[is_idle_tsk]);
else
pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
t->pid,
".I"[trc_rdr.ipi_to_cpu >= 0],
".i"[is_idle_tsk],
".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
".B"[!!data_race(t->trc_reader_special.b.blocked)],
trc_rdr.nesting,
" N"[!!trc_rdr.needqs],
cpu);
" !CN"[trc_rdr.needqs & 0x3],
" ?"[trc_rdr.needqs > 0x3],
cpu, cpu_online(cpu) ? "" : "(offline)");
sched_show_task(t);
}
@ -1481,18 +1604,18 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
{
struct task_struct *g, *t;
// Disable CPU hotplug across the holdout list scan.
// Disable CPU hotplug across the holdout list scan for IPIs.
cpus_read_lock();
list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
// If safe and needed, try to check the current task.
if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
!READ_ONCE(t->trc_reader_checked))
!(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
READ_ONCE(t->trc_reader_checked))
rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
@ -1516,10 +1639,6 @@ static void rcu_tasks_trace_empty_fn(void *unused)
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
int cpu;
bool firstreport;
struct task_struct *g, *t;
LIST_HEAD(holdouts);
long ret;
// Wait for any lingering IPI handlers to complete. Note that
// if a CPU has gone offline or transitioned to userspace in the
@ -1530,37 +1649,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
// Remove the safety count.
smp_mb__before_atomic(); // Order vs. earlier atomics
atomic_dec(&trc_n_readers_need_end);
smp_mb__after_atomic(); // Order vs. later atomics
// Wait for readers.
set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
for (;;) {
ret = wait_event_idle_exclusive_timeout(
trc_wait,
atomic_read(&trc_n_readers_need_end) == 0,
READ_ONCE(rcu_task_stall_timeout));
if (ret)
break; // Count reached zero.
// Stall warning time, so make a list of the offenders.
rcu_read_lock();
for_each_process_thread(g, t)
if (READ_ONCE(t->trc_reader_special.b.need_qs))
trc_add_holdout(t, &holdouts);
rcu_read_unlock();
firstreport = true;
list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
if (READ_ONCE(t->trc_reader_special.b.need_qs))
show_stalled_task_trace(t, &firstreport);
trc_del_holdout(t); // Release task_struct reference.
}
if (firstreport)
pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
show_stalled_ipi_trace();
pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
}
smp_mb(); // Caller's code must be ordered after wakeup.
// Pairs with pretty much every ordering primitive.
}
@ -1568,11 +1656,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
/* Report any needed quiescent state for this exiting task. */
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
WRITE_ONCE(t->trc_reader_checked, true);
union rcu_special trs = READ_ONCE(t->trc_reader_special);
rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
WRITE_ONCE(t->trc_reader_nesting, 0);
if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
rcu_read_unlock_trace_special(t);
else
WRITE_ONCE(t->trc_reader_nesting, 0);
}
/**
@ -1646,7 +1737,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
rcu_tasks_trace.init_fract = 1;
}
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
@ -1659,7 +1749,8 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
sprintf(buf, "N%lu h:%lu/%lu/%lu",
data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
data_race(n_heavy_reader_attempts));
@ -1686,23 +1777,24 @@ struct rcu_tasks_test_desc {
struct rcu_head rh;
const char *name;
bool notrun;
unsigned long runstart;
};
static struct rcu_tasks_test_desc tests[] = {
{
.name = "call_rcu_tasks()",
/* If not defined, the test is skipped. */
.notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
.notrun = IS_ENABLED(CONFIG_TASKS_RCU),
},
{
.name = "call_rcu_tasks_rude()",
/* If not defined, the test is skipped. */
.notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
.notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
},
{
.name = "call_rcu_tasks_trace()",
/* If not defined, the test is skipped. */
.notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
.notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
}
};
@ -1713,46 +1805,85 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
pr_info("Callback from %s invoked.\n", rttd->name);
rttd->notrun = true;
rttd->notrun = false;
}
static void rcu_tasks_initiate_self_tests(void)
{
unsigned long j = jiffies;
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
tests[0].runstart = j;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
tests[1].runstart = j;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
tests[2].runstart = j;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
#endif
}
/*
* Return: 0 - test passed
* 1 - test failed, but has not timed out yet
* -1 - test failed and timed out
*/
static int rcu_tasks_verify_self_tests(void)
{
int ret = 0;
int i;
unsigned long bst = rcu_task_stall_timeout;
if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
bst = RCU_TASK_BOOT_STALL_TIMEOUT;
for (i = 0; i < ARRAY_SIZE(tests); i++) {
if (!tests[i].notrun) { // still hanging.
pr_err("%s has been failed.\n", tests[i].name);
ret = -1;
while (tests[i].notrun) { // still hanging.
if (time_after(jiffies, tests[i].runstart + bst)) {
pr_err("%s has failed boot-time tests.\n", tests[i].name);
ret = -1;
break;
}
ret = 1;
break;
}
}
if (ret)
WARN_ON(1);
WARN_ON(ret < 0);
return ret;
}
late_initcall(rcu_tasks_verify_self_tests);
/*
* Repeat the rcu_tasks_verify_self_tests() call once every second until the
* test passes or has timed out.
*/
static struct delayed_work rcu_tasks_verify_work;
static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
{
int ret = rcu_tasks_verify_self_tests();
if (ret <= 0)
return;
/* Test failing but not yet timed out, so reschedule another check. */
schedule_delayed_work(&rcu_tasks_verify_work, HZ);
}
static int rcu_tasks_verify_schedule_work(void)
{
INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
rcu_tasks_verify_work_fn(NULL);
return 0;
}
late_initcall(rcu_tasks_verify_schedule_work);
#else /* #ifdef CONFIG_PROVE_RCU */
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */

View File

@ -58,7 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
* Therefore, any legal call to synchronize_rcu() is a quiescent
* state, and so on a UP system, synchronize_rcu() need do nothing.
* Therefore, any legal call to synchronize_rcu() is a quiescent state,
* and so on a UP system, synchronize_rcu() need do nothing, other than
* let the polled APIs know that another grace period elapsed.
*
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@ -152,6 +154,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@ -213,10 +216,24 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
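For context, here is a minimal caller-side sketch of the polled grace-period API that the Tiny RCU hooks above serve. The structure and helper names are illustrative; only the RCU functions themselves are from the kernel:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
        unsigned long gp_cookie;
        void *payload;
};

static void example_retire(struct example_obj *p)
{
        /* Snapshot grace-period state (starting a GP if needed). */
        p->gp_cookie = start_poll_synchronize_rcu();
}

static bool example_try_free(struct example_obj *p)
{
        /* Free only once a full grace period has elapsed since retirement. */
        if (!poll_state_synchronize_rcu(p->gp_cookie))
                return false;
        kfree(p->payload);
        kfree(p);
        return true;
}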
#ifdef CONFIG_KASAN_GENERIC
void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
if (head) {
void *ptr = (void *) head - (unsigned long) func;
kasan_record_aux_stack_noalloc(ptr);
}
__kvfree_call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
#endif
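The #ifdef block above only records a KASAN aux stack before handing off to __kvfree_call_rcu(), so callers keep using the usual kvfree_rcu() interface. A hedged usage sketch; the structure and function are illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
        int key;
        struct rcu_head rh;     /* Required by two-argument kvfree_rcu(). */
};

static void example_node_release(struct example_node *p)
{
        /* Frees p after a grace period; with CONFIG_KASAN_GENERIC the
         * wrapper above also records the caller's stack for reports. */
        kvfree_rcu(p, rh);
}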
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);

View File

@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
/* rcuc/rcub/rcuop kthread realtime priority */
/*
* rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
* real-time priority (enabling/disabling) is controlled by
* the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
*/
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@ -1775,6 +1779,79 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
// Has rcu_init() been invoked? This is used (for example) to determine
// whether spinlocks may be acquired safely.
static bool rcu_init_invoked(void)
{
return !!rcu_state.n_online_cpus;
}
// Make the polled API aware of the beginning of a grace period.
static void rcu_poll_gp_seq_start(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked())
raw_lockdep_assert_held_rcu_node(rnp);
// If RCU was idle, note beginning of GP.
if (!rcu_seq_state(rcu_state.gp_seq_polled))
rcu_seq_start(&rcu_state.gp_seq_polled);
// Either way, record current state.
*snap = rcu_state.gp_seq_polled;
}
// Make the polled API aware of the end of a grace period.
static void rcu_poll_gp_seq_end(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked())
raw_lockdep_assert_held_rcu_node(rnp);
// If the previously noted GP is still in effect, record the
// end of that GP. Either way, zero counter to avoid counter-wrap
// problems.
if (*snap && *snap == rcu_state.gp_seq_polled) {
rcu_seq_end(&rcu_state.gp_seq_polled);
rcu_state.gp_seq_polled_snap = 0;
rcu_state.gp_seq_polled_exp_snap = 0;
} else {
*snap = 0;
}
}
// Make the polled API aware of the beginning of a grace period, but
// where caller does not hold the root rcu_node structure's lock.
static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
lockdep_assert_irqs_enabled();
raw_spin_lock_irq_rcu_node(rnp);
}
rcu_poll_gp_seq_start(snap);
if (rcu_init_invoked())
raw_spin_unlock_irq_rcu_node(rnp);
}
// Make the polled API aware of the end of a grace period, but where
// caller does not hold the root rcu_node structure's lock.
static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
lockdep_assert_irqs_enabled();
raw_spin_lock_irq_rcu_node(rnp);
}
rcu_poll_gp_seq_end(snap);
if (rcu_init_invoked())
raw_spin_unlock_irq_rcu_node(rnp);
}
/*
* Initialize a new grace period. Return false if no grace period required.
*/
@ -1810,6 +1887,7 @@ static noinline_for_stack bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@ -1971,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time)
*/
static noinline_for_stack void rcu_gp_fqs_loop(void)
{
bool first_gp_fqs;
bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
if (!ret) {
if (rcu_state.cbovld) {
j = (j + 2) / 3;
if (j <= 0)
j = 1;
}
if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
/*
* jiffies_force_qs before RCU_GP_WAIT_FQS state
@ -2001,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
/*
* Exit the loop if the root rcu_node structure indicates that the grace period
* has ended. The rcu_preempt_blocked_readers_cgp(rnp) check
* is required only for single-node rcu_node trees because readers blocking
* the current grace period are queued only on leaf rcu_node structures.
* For multi-node trees, checking the root node's ->qsmask suffices, because a
* given root node's ->qsmask bit is cleared only when all CPUs and tasks from
* the corresponding leaf nodes have passed through their quiescent state.
*/
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
@ -2069,6 +2159,7 @@ static noinline void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@ -2530,7 +2621,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
rcu_is_callbacks_kthread(rdp));
return;
}
@ -2608,7 +2699,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
@ -3211,7 +3302,6 @@ struct kfree_rcu_cpu_work {
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
* @count: Number of objects for which GP not started
* @bkvcache:
@ -3236,7 +3326,6 @@ struct kfree_rcu_cpu {
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
int count;
@ -3416,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work)
}
}
static bool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{
int i;
for (i = 0; i < FREE_N_CHANNELS; i++)
if (krcp->bkvhead[i])
return true;
return !!krcp->head;
}
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@ -3472,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// of the channels that is still busy we should rearm the
// work to repeat an attempt. Because previous batches are
// still in progress.
if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
krcp->monitor_todo = false;
else
if (need_offload_krc(krcp))
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
@ -3662,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
WRITE_ONCE(krcp->count, krcp->count + 1);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!krcp->monitor_todo) {
krcp->monitor_todo = true;
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
}
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@ -3741,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
krcp->monitor_todo) {
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
if (need_offload_krc(krcp))
schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@ -3837,8 +3927,18 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
if (rcu_blocking_is_gp())
if (rcu_blocking_is_gp()) {
// Note well that this code runs with !PREEMPT && !SMP.
// In addition, all code that advances grace periods runs at
// process level. Therefore, this normal GP overlaps with
// other normal GPs only by being fully nested within them,
// which allows reuse of ->gp_seq_polled_snap.
rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
if (rcu_init_invoked())
cond_resched_tasks_rcu_qs();
return; // Context allows vacuous grace periods.
}
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
@ -3860,7 +3960,7 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
return rcu_seq_snap(&rcu_state.gp_seq);
return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
@ -3889,7 +3989,13 @@ unsigned long start_poll_synchronize_rcu(void)
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); // irqs already disabled.
needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
// Note it is possible for a grace period to have elapsed between
// the above call to get_state_synchronize_rcu() and the below call
// to rcu_seq_snap. This is OK, the worst that happens is that we
// get a grace period that no one needed. These accesses are ordered
// by smp_mb(), and we are accessing them in the opposite order
// from which they are updated at grace-period start, as required.
needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
@ -3911,7 +4017,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!).
* more than a billion grace periods (and way more on a 64-bit system!).
* Those needing to keep oldstate values for very long time periods
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
@ -3924,7 +4030,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
if (oldstate == RCU_GET_STATE_COMPLETED ||
rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
smp_mb(); /* Ensure GP ends before subsequent accesses. */
return true;
}
@ -3935,20 +4042,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
* Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
* Yes, this function does not take counter wrap into account. But
* counter wrap is harmless. If the counter wraps, we have waited for
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
* so waiting for a couple of additional grace periods should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate, and that returned at the end
* to the function that provided @oldstate and that returned at the end
* of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
@ -4441,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu)
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
@ -4486,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu)
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
check_cb_ovld_locked(my_rdp, my_rnp);
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@ -4701,6 +4810,9 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
mutex_init(&rnp->boost_kthread_mutex);
raw_spin_lock_init(&rnp->exp_poll_lock);
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@ -4926,6 +5038,10 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
// Kick-start any polled grace periods that started early.
if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
(void)start_poll_synchronize_rcu_expedited();
}
#include "tree_stall.h"

View File

@ -133,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
raw_spinlock_t exp_poll_lock;
/* Lock and data for polled expedited grace periods. */
unsigned long exp_seq_poll_rq;
struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@ -235,6 +239,7 @@ struct rcu_data {
* if rdp_gp.
*/
struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@ -323,6 +328,9 @@ struct rcu_state {
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
unsigned long gp_seq_polled; /* GP seq for polled API. */
unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@ -425,7 +433,7 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@ -481,3 +489,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
/* Forward declarations for tree_exp.h. */
static void sync_rcu_do_polled_gp(struct work_struct *wp);

View File

@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@ -621,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
panic_on_rcu_stall();
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
@ -636,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
pr_cont(" %d-%c%c%c", cpu,
pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
"D."[!!(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
@ -669,6 +671,7 @@ static void synchronize_rcu_expedited_wait(void)
}
}
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
panic_on_rcu_stall();
}
}
@ -913,8 +916,18 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
/* Is the state is such that the call is a grace period? */
if (rcu_blocking_is_gp())
return;
if (rcu_blocking_is_gp()) {
// Note well that this code runs with !PREEMPT && !SMP.
// In addition, all code that advances grace periods runs
// at process level. Therefore, this expedited GP overlaps
// with other expedited GPs only by being fully nested within
// them, which allows reuse of ->gp_seq_polled_exp_snap.
rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
if (rcu_init_invoked())
cond_resched();
return; // Context allows vacuous grace periods.
}
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
@ -950,3 +963,93 @@ void synchronize_rcu_expedited(void)
synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
/*
* Ensure that start_poll_synchronize_rcu_expedited() has the expedited
* RCU grace periods that it needs.
*/
static void sync_rcu_do_polled_gp(struct work_struct *wp)
{
unsigned long flags;
int i = 0;
struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
unsigned long s;
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
s = rnp->exp_seq_poll_rq;
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
if (s == RCU_GET_STATE_COMPLETED)
return;
while (!poll_state_synchronize_rcu(s)) {
synchronize_rcu_expedited();
if (i == 10 || i == 20)
pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
i++;
}
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
s = rnp->exp_seq_poll_rq;
if (poll_state_synchronize_rcu(s))
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
}
/**
* start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
*
* Returns a cookie to pass to a call to cond_synchronize_rcu(),
* cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
* allowing them to determine whether or not any sort of grace period has
* elapsed in the meantime. If the needed expedited grace period is not
* already slated to start, initiates that grace period.
*/
unsigned long start_poll_synchronize_rcu_expedited(void)
{
unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
unsigned long s;
s = get_state_synchronize_rcu();
rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
rnp = rdp->mynode;
if (rcu_init_invoked())
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
if (!poll_state_synchronize_rcu(s)) {
rnp->exp_seq_poll_rq = s;
if (rcu_init_invoked())
queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
}
if (rcu_init_invoked())
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
return s;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
/**
* cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
*
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If any type of full RCU grace period has elapsed since the earlier
* call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
* or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
* invoke synchronize_rcu_expedited() to wait for a full grace period.
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for a couple of additional grace periods should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate and that returned at the end
* of this function.
*/
void cond_synchronize_rcu_expedited(unsigned long oldstate)
{
if (!poll_state_synchronize_rcu(oldstate))
synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
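Taken together with start_poll_synchronize_rcu_expedited() above, a hedged caller-side sketch of the expedited polled API; the example functions and the cookie variable are illustrative:

#include <linux/rcupdate.h>

static unsigned long example_exp_cookie;

static void example_begin_update(void)
{
        /* Snapshot state and initiate an expedited GP if one is needed. */
        example_exp_cookie = start_poll_synchronize_rcu_expedited();
}

static void example_finish_update(void)
{
        /* No-op if a full GP already elapsed, otherwise waits (expedited). */
        cond_synchronize_rcu_expedited(example_exp_cookie);
}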

View File

@ -546,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
}
}
/*
* Check if we ignore this rdp.
*
* We check that without holding the nocb lock but
* we make sure not to miss a freshly offloaded rdp
* with the current ordering:
*
* rdp_offload_toggle() nocb_gp_enabled_cb()
* ------------------------- ----------------------------
* WRITE flags LOCK nocb_gp_lock
* LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
* READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
* UNLOCK nocb_gp_lock READ flags
*/
static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
{
u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
return rcu_segcblist_test_flags(&rdp->cblist, flags);
}
static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
bool *needwake_state)
static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
bool *wake_state)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
int ret;
if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
*needwake_state = true;
}
return false;
rcu_nocb_lock_irqsave(rdp, flags);
if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
/*
* Offloading. Set our flag and notify the offload worker.
* We will handle this rdp until it is eventually de-offloaded.
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
*wake_state = true;
ret = 1;
} else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
/*
* De-offloading. Clear our flag and notify the de-offload worker.
* We will ignore this rdp until it is eventually re-offloaded.
*/
rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
*wake_state = true;
ret = 0;
} else {
WARN_ON_ONCE(1);
ret = -1;
}
/*
* De-offloading. Clear our flag and notify the de-offload worker.
* We will ignore this rdp until it ever gets re-offloaded.
*/
WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
*needwake_state = true;
return true;
rcu_nocb_unlock_irqrestore(rdp, flags);
return ret;
}
static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
{
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
!READ_ONCE(my_rdp->nocb_gp_sleep));
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
}
/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
@ -609,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
bool needwake_gp;
struct rcu_data *rdp;
struct rcu_data *rdp, *rdp_toggling = NULL;
struct rcu_node *rnp;
unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
bool wasempty = false;
@ -634,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* is added to the list, so the skipped-over rcu_data structures
* won't be ignored for long.
*/
list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
bool needwake_state = false;
if (!nocb_gp_enabled_cb(rdp))
continue;
list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_state)
swake_up_one(&rdp->nocb_state_wq);
continue;
}
lockdep_assert_held(&rdp->nocb_lock);
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@ -656,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_state)
swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
@ -705,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (needwake_gp)
rcu_gp_kthread_wake();
if (needwake_state)
swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
@ -723,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
schedule_timeout_idle(1);
if (list_empty(&my_rdp->nocb_head_rdp)) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
if (!my_rdp->nocb_toggling_rdp)
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
/* Wait for any offloading rdp */
nocb_gp_sleep(my_rdp, cpu);
} else {
schedule_timeout_idle(1);
}
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
!READ_ONCE(my_rdp->nocb_gp_sleep));
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
nocb_gp_sleep(my_rdp, cpu);
} else {
rnp = my_rdp->mynode;
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
@ -739,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
!READ_ONCE(my_rdp->nocb_gp_sleep));
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
if (!rcu_nocb_poll) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
// (De-)queue an rdp to/from the group if its nocb state is changing
rdp_toggling = my_rdp->nocb_toggling_rdp;
if (rdp_toggling)
my_rdp->nocb_toggling_rdp = NULL;
if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&my_rdp->nocb_timer);
}
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
} else {
rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
if (rdp_toggling) {
/*
* Paranoid locking to make sure nocb_toggling_rdp is reset
* *before* we (re)set SEGCBLIST_KTHREAD_GP, or we could race
* with another round of nocb toggling for this rdp.
* The nocb locking should already prevent that, but we stick
* to paranoia, especially on this rare path.
*/
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
my_rdp->nocb_toggling_rdp = NULL;
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
}
}
if (rdp_toggling) {
bool wake_state = false;
int ret;
ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
if (ret == 1)
list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
else if (ret == 0)
list_del(&rdp_toggling->nocb_entry_rdp);
if (wake_state)
swake_up_one(&rdp_toggling->nocb_state_wq);
}
my_rdp->nocb_gp_seq = -1;
WARN_ON(signal_pending(current));
}
@ -966,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
swake_up_one(&rdp->nocb_cb_wq);
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
if (rdp_gp->nocb_gp_sleep) {
rdp_gp->nocb_gp_sleep = false;
wake_gp = true;
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
return 0;
return wake_gp;
}
static long rcu_nocb_rdp_deoffload(void *arg)
@ -983,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
int ret;
int wake_gp;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
/*
* rcu_nocb_rdp_deoffload() may be called directly if
* rcuog/o[p] spawn failed, because at that point rdp->cpu
* is not yet online.
*/
WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
pr_info("De-offloading %d\n", rdp->cpu);
@ -1009,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg)
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
invoke_rcu_core();
ret = rdp_offload_toggle(rdp, false, flags);
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
SEGCBLIST_KTHREAD_GP));
/* Stop nocb_gp_wait() from iterating over this structure. */
list_del_rcu(&rdp->nocb_entry_rdp);
wake_gp = rdp_offload_toggle(rdp, false, flags);
mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
if (rdp_gp->nocb_gp_kthread) {
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
/*
* If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
* Just wait for SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
*/
if (!rdp->nocb_cb_kthread) {
rcu_nocb_lock_irqsave(rdp, flags);
rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
rcu_nocb_unlock_irqrestore(rdp, flags);
}
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist,
SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
} else {
/*
* There is no kthread to clear the flags for us or to remove the rdp
* from the nocb list to iterate over. Do it here instead. Locking doesn't
* look strictly necessary, but we stick to paranoia on this rare path.
*/
rcu_nocb_lock_irqsave(rdp, flags);
rcu_segcblist_clear_flags(&rdp->cblist,
SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
rcu_nocb_unlock_irqrestore(rdp, flags);
list_del(&rdp->nocb_entry_rdp);
}
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/*
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
@ -1035,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
return ret;
return 0;
}
int rcu_nocb_cpu_deoffload(int cpu)
@ -1043,8 +1103,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
mutex_lock(&rcu_state.barrier_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
@ -1055,8 +1115,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
ret = -EINVAL;
}
}
cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
cpus_read_unlock();
return ret;
}
@ -1067,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
int ret;
int wake_gp;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
/*
@ -1077,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
return -EINVAL;
/*
* Cause future nocb_gp_wait() invocations to iterate over
* structure, resetting ->nocb_gp_sleep and waking up the related
* "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
* before setting ->nocb_gp_sleep again, we are guaranteed to
* iterate this newly added structure before "rcuog" goes to
* sleep again.
*/
list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
pr_info("Offloading %d\n", rdp->cpu);
/*
* Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
@ -1111,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg)
* WRITE flags READ callbacks
* rcu_nocb_unlock() rcu_nocb_unlock()
*/
ret = rdp_offload_toggle(rdp, true, flags);
wake_gp = rdp_offload_toggle(rdp, true, flags);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
@ -1124,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg)
rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
rcu_nocb_unlock_irqrestore(rdp, flags);
return ret;
return 0;
}
int rcu_nocb_cpu_offload(int cpu)
@ -1132,8 +1188,8 @@ int rcu_nocb_cpu_offload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
mutex_lock(&rcu_state.barrier_mutex);
if (!rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
@ -1144,8 +1200,8 @@ int rcu_nocb_cpu_offload(int cpu)
ret = -EINVAL;
}
}
cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
cpus_read_unlock();
return ret;
}
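A hedged sketch of how a caller might toggle callback offloading at runtime through the two entry points above; the wrapper and its error handling are illustrative:

#include <linux/printk.h>
#include <linux/rcupdate.h>

static int example_toggle_nocb(int cpu, bool offload)
{
        int ret;

        /* Both helpers return 0 on success; a negative errno (such as
         * -EINVAL) means the CPU cannot be toggled in its current state. */
        ret = offload ? rcu_nocb_cpu_offload(cpu) : rcu_nocb_cpu_deoffload(cpu);
        if (ret)
                pr_warn("nocb toggle failed for CPU %d: %d\n", cpu, ret);
        return ret;
}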
@ -1155,11 +1211,21 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
bool offload_all = false;
struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
if (!rcu_state.nocb_is_setup) {
need_rcu_nocb_mask = true;
offload_all = true;
}
#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
need_rcu_nocb_mask = true;
offload_all = false; /* NO_HZ_FULL has its own mask. */
}
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (need_rcu_nocb_mask) {
@ -1180,6 +1246,9 @@ void __init rcu_init_nohz(void)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (offload_all)
cpumask_setall(rcu_nocb_mask);
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
@ -1246,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
"rcuog/%d", rdp_gp->cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
return;
goto end;
}
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
if (kthread_prio)
@ -1258,12 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
goto end;
if (kthread_prio)
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
return;
end:
mutex_lock(&rcu_state.barrier_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
rcu_nocb_rdp_deoffload(rdp);
cpumask_clear_cpu(cpu, rcu_nocb_mask);
}
mutex_unlock(&rcu_state.barrier_mutex);
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */

View File

@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
static void
static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
static void rcu_preempt_deferred_qs(struct task_struct *t)
static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@ -899,8 +899,8 @@ void rcu_note_context_switch(bool preempt)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
rcu_tasks_qs(current, preempt);
out:
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
// period for a quiescent state from this CPU. Note that requests from
// tasks are handled when removing the task from the blocked-tasks list
// below.
static void rcu_preempt_deferred_qs(struct task_struct *t)
static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
{
#ifdef CONFIG_RCU_NOCB_CPU
return rdp->nocb_cb_kthread == current;
#else
return false;
#endif
}
/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
{
return rdp->rcu_cpu_kthread_task == current ||
rcu_is_callbacks_nocb_kthread(rdp);
}
#ifdef CONFIG_RCU_BOOST
/*
@ -1140,7 +1159,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
(!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
(!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@ -1151,15 +1171,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
static bool rcu_is_callbacks_kthread(void)
{
return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
}
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@ -1242,11 +1253,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static bool rcu_is_callbacks_kthread(void)
{
return false;
}
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

View File

@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
/**
* get_completed_synchronize_rcu - Return a pre-completed polled state cookie
*
* Returns a value that will always be treated by functions like
* poll_state_synchronize_rcu() as a cookie whose grace period has already
* completed.
*/
unsigned long get_completed_synchronize_rcu(void)
{
return RCU_GET_STATE_COMPLETED;
}
EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
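A hedged sketch of the intended use: pre-initialize a stored cookie so that polling treats the grace period as already complete until a real cookie is recorded. The structure and functions are illustrative:

#include <linux/rcupdate.h>

struct example_cache {
        unsigned long gp_cookie;
};

static void example_cache_init(struct example_cache *c)
{
        /* Until an update records a real cookie, polling must not
         * report a pending grace period. */
        c->gp_cookie = get_completed_synchronize_rcu();
}

static bool example_cache_quiesced(struct example_cache *c)
{
        /* True immediately after init, and true again once a GP has
         * elapsed since the last recorded cookie. */
        return poll_state_synchronize_rcu(c->gp_cookie);
}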
#ifdef CONFIG_PROVE_RCU
/*

View File

@ -4263,6 +4263,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
return ret;
}
/**
* cpu_curr_snapshot - Return a snapshot of the currently running task
* @cpu: The CPU on which to snapshot the task.
*
* Returns the task_struct pointer of the task "currently" running on
* the specified CPU. If the same task is running on that CPU throughout,
* the return value will be a pointer to that task's task_struct structure.
* If the CPU did any context switches even vaguely concurrently with the
* execution of this function, the return value will be a pointer to the
* task_struct structure of a randomly chosen task that was running on
* that CPU somewhere around the time that this function was executing.
*
* If the specified CPU was offline, the return value is whatever it
* is, perhaps a pointer to the task_struct structure of that CPU's idle
* task, but there is no guarantee. Callers wishing a useful return
* value must take some action to ensure that the specified CPU remains
* online throughout.
*
* This function executes full memory barriers before and after fetching
* the pointer, which permits the caller to confine this function's fetch
* with respect to the caller's accesses to other shared variables.
*/
struct task_struct *cpu_curr_snapshot(int cpu)
{
struct task_struct *t;
smp_mb(); /* Pairing determined by caller's synchronization design. */
t = rcu_dereference(cpu_curr(cpu));
smp_mb(); /* Pairing determined by caller's synchronization design. */
return t;
}
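A hedged sketch of the calling convention described in the kernel-doc above: keep the CPU online across the call and stay in an RCU read-side critical section while the snapshotted task is inspected, as the RCU tasks-trace scan in this series does. The scan helper is illustrative, and it assumes the declaration exported by this series is visible via <linux/sched.h>:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void example_scan_running_tasks(void (*inspect)(struct task_struct *t))
{
        int cpu;
        struct task_struct *t;

        cpus_read_lock();               /* Keep CPUs online across the scan. */
        for_each_online_cpu(cpu) {
                rcu_read_lock();        /* Keep the snapshotted task around. */
                t = cpu_curr_snapshot(cpu);
                inspect(t);
                rcu_read_unlock();
        }
        cpus_read_unlock();
}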
/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.

View File

@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str)
if (val)
static_branch_enable(&csdlock_debug_enabled);
return 0;
return 1;
}
early_param("csdlock_debug", csdlock_debug);
__setup("csdlock_debug=", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);

View File

@ -35,7 +35,7 @@ then
exit 1
fi
# Remember where we started so that we can get back and the end.
# Remember where we started so that we can get back at the end.
curcommit="`git status | head -1 | awk '{ print $NF }'`"
nfail=0
@ -73,15 +73,10 @@ do
# Test the specified commit.
git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
echo git checkout return code: $? "(Commit $ntry: $i)"
kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1
ret=$?
echo kvm.sh return code $ret for commit $i from branch $gitbr
# Move the build products to their resting place.
runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
mv $runresdir $resdir/$ds/$idir
rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
echo Run results: $resdir/$ds/$idir/$rrd
echo Run results: $resdir/$ds/$idir
if test "$ret" -ne 0
then
# Failure, so leave all evidence intact.

View File

@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log"
# Wait for all remaining scenarios to complete and collect results.
for i in $systems
do
echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log"
while checkremotefile "$i" "$resdir/$ds/remote.run"
do
sleep 30

View File

@ -164,7 +164,7 @@ do
shift
;;
--gdb)
TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG
TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG
TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG
;;
@ -180,7 +180,7 @@ do
shift
;;
--kasan)
TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
if test -n "$torture_qemu_mem_default"
then
TORTURE_QEMU_MEM=2G
@ -192,7 +192,7 @@ do
shift
;;
--kcsan)
TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
;;
--kmake-arg|--kmake-args)
checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'