Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU updates from Ingo Molnar:
 "The main RCU changes in this development cycle were:

   - Miscellaneous fixes, including a change to call_rcu()'s rcu_head
     alignment check.

   - Security-motivated list consistency checks, which are disabled by
     default behind DEBUG_LIST.

   - Torture-test updates.

   - Documentation updates, yet again just simple changes"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  torture: Prevent jitter from delaying build-only runs
  torture: Remove obsolete files from rcutorture .gitignore
  rcu: Don't kick unless grace period or request
  rcu: Make expedited grace periods recheck dyntick idle state
  torture: Trace long read-side delays
  rcu: RCU_TRACE enables event tracing as well as debugfs
  rcu: Remove obsolete comment from __call_rcu()
  rcu: Remove obsolete rcu_check_callbacks() header comment
  rcu: Tighten up __call_rcu() rcu_head alignment check
  Documentation/RCU: Fix minor typo
  documentation: Present updated RCU guarantee
  bug: Avoid Kconfig warning for BUG_ON_DATA_CORRUPTION
  lib/Kconfig.debug: Fix typo in select statement
  lkdtm: Add tests for struct list corruption
  bug: Provide toggle for BUG on data corruption
  list: Split list_del() debug checking into separate function
  rculist: Consolidate DEBUG_LIST for list_add_rcu()
  list: Split list_add() debug checking into separate function
This commit is contained in:
Linus Torvalds 2016-12-12 09:09:54 -08:00
commit 718c0ddd6a
17 changed files with 222 additions and 108 deletions

View File

@ -547,7 +547,7 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
It could reuse a value formerly fetched from this same pointer.
It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
manner, resulting in <i>load tearing</i>, in turn resulting a bytewise
mash-up of two distince pointer values.
mash-up of two distinct pointer values.
It might even use value-speculation optimizations, where it makes
a wrong guess, but by the time it gets around to checking the
value, an update has changed the pointer to match the wrong guess.
@ -659,6 +659,29 @@ systems with more than one CPU:
In other words, a given instance of <tt>synchronize_rcu()</tt>
can avoid waiting on a given RCU read-side critical section only
if it can prove that <tt>synchronize_rcu()</tt> started first.
<p>
A related question is &ldquo;When <tt>rcu_read_lock()</tt>
doesn't generate any code, why does it matter how it relates
to a grace period?&rdquo;
The answer is that it is not the relationship of
<tt>rcu_read_lock()</tt> itself that is important, but rather
the relationship of the code within the enclosed RCU read-side
critical section to the code preceding and following the
grace period.
If we take this viewpoint, then a given RCU read-side critical
section begins before a given grace period when some access
preceding the grace period observes the effect of some access
within the critical section, in which case none of the accesses
within the critical section may observe the effects of any
access following the grace period.
<p>
As of late 2016, mathematical models of RCU take this
viewpoint, for example, see slides&nbsp;62 and&nbsp;63
of the
<a href="http://www2.rdrop.com/users/paulmck/scalability/paper/LinuxMM.2016.10.04c.LCE.pdf">2016 LinuxCon EU</a>
presentation.
</font></td></tr>
<tr><td>&nbsp;</td></tr>
</table>

View File

@ -237,7 +237,7 @@ rcu_dereference()
The reader uses rcu_dereference() to fetch an RCU-protected
pointer, which returns a value that may then be safely
dereferenced. Note that rcu_deference() does not actually
dereferenced. Note that rcu_dereference() does not actually
dereference the pointer, instead, it protects the pointer for
later dereferencing. It also executes any needed memory-barrier
instructions for a given CPU architecture. Currently, only Alpha

View File

@ -21,6 +21,8 @@ void lkdtm_SPINLOCKUP(void);
void lkdtm_HUNG_TASK(void);
void lkdtm_ATOMIC_UNDERFLOW(void);
void lkdtm_ATOMIC_OVERFLOW(void);
void lkdtm_CORRUPT_LIST_ADD(void);
void lkdtm_CORRUPT_LIST_DEL(void);
/* lkdtm_heap.c */
void lkdtm_OVERWRITE_ALLOCATION(void);

View File

@ -5,8 +5,13 @@
* test source files.
*/
#include "lkdtm.h"
#include <linux/list.h>
#include <linux/sched.h>
struct lkdtm_list {
struct list_head node;
};
/*
* Make sure our attempts to over run the kernel stack doesn't trigger
* a compiler warning when CONFIG_FRAME_WARN is set. Then make sure we
@ -146,3 +151,66 @@ void lkdtm_ATOMIC_OVERFLOW(void)
pr_info("attempting bad atomic overflow\n");
atomic_inc(&over);
}
void lkdtm_CORRUPT_LIST_ADD(void)
{
/*
* Initially, an empty list via LIST_HEAD:
* test_head.next = &test_head
* test_head.prev = &test_head
*/
LIST_HEAD(test_head);
struct lkdtm_list good, bad;
void *target[2] = { };
void *redirection = &target;
pr_info("attempting good list addition\n");
/*
* Adding to the list performs these actions:
* test_head.next->prev = &good.node
* good.node.next = test_head.next
* good.node.prev = test_head
* test_head.next = good.node
*/
list_add(&good.node, &test_head);
pr_info("attempting corrupted list addition\n");
/*
* In simulating this "write what where" primitive, the "what" is
* the address of &bad.node, and the "where" is the address held
* by "redirection".
*/
test_head.next = redirection;
list_add(&bad.node, &test_head);
if (target[0] == NULL && target[1] == NULL)
pr_err("Overwrite did not happen, but no BUG?!\n");
else
pr_err("list_add() corruption not detected!\n");
}
void lkdtm_CORRUPT_LIST_DEL(void)
{
LIST_HEAD(test_head);
struct lkdtm_list item;
void *target[2] = { };
void *redirection = &target;
list_add(&item.node, &test_head);
pr_info("attempting good list removal\n");
list_del(&item.node);
pr_info("attempting corrupted list removal\n");
list_add(&item.node, &test_head);
/* As with the list_add() test above, this corrupts "next". */
item.node.next = redirection;
list_del(&item.node);
if (target[0] == NULL && target[1] == NULL)
pr_err("Overwrite did not happen, but no BUG?!\n");
else
pr_err("list_del() corruption not detected!\n");
}

View File

@ -197,6 +197,8 @@ struct crashtype crashtypes[] = {
CRASHTYPE(EXCEPTION),
CRASHTYPE(LOOP),
CRASHTYPE(OVERFLOW),
CRASHTYPE(CORRUPT_LIST_ADD),
CRASHTYPE(CORRUPT_LIST_DEL),
CRASHTYPE(CORRUPT_STACK),
CRASHTYPE(UNALIGNED_LOAD_STORE_WRITE),
CRASHTYPE(OVERWRITE_ALLOCATION),

View File

@ -121,4 +121,21 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
}
#endif /* CONFIG_GENERIC_BUG */
/*
* Since detected data corruption should stop operation on the affected
* structures, this returns false if the corruption condition is found.
*/
#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \
do { \
if (unlikely(condition)) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
pr_err(fmt, ##__VA_ARGS__); \
BUG(); \
} else \
WARN(1, fmt, ##__VA_ARGS__); \
return false; \
} \
} while (0)
#endif /* _LINUX_BUG_H */

View File

@ -28,27 +28,42 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
list->prev = list;
}
#ifdef CONFIG_DEBUG_LIST
extern bool __list_add_valid(struct list_head *new,
struct list_head *prev,
struct list_head *next);
extern bool __list_del_entry_valid(struct list_head *entry);
#else
static inline bool __list_add_valid(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
return true;
}
static inline bool __list_del_entry_valid(struct list_head *entry)
{
return true;
}
#endif
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
#ifndef CONFIG_DEBUG_LIST
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
#else
extern void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next);
#endif
/**
* list_add - add a new entry
@ -96,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
* Note: list_empty() on entry does not return true after this, the entry is
* in an undefined state.
*/
#ifndef CONFIG_DEBUG_LIST
static inline void __list_del_entry(struct list_head *entry)
{
if (!__list_del_entry_valid(entry))
return;
__list_del(entry->prev, entry->next);
}
static inline void list_del(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
__list_del_entry(entry);
entry->next = LIST_POISON1;
entry->prev = LIST_POISON2;
}
#else
extern void __list_del_entry(struct list_head *entry);
extern void list_del(struct list_head *entry);
#endif
/**
* list_replace - replace old entry by new one

View File

@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
#ifndef CONFIG_DEBUG_LIST
static inline void __list_add_rcu(struct list_head *new,
struct list_head *prev, struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
new->next = next;
new->prev = prev;
rcu_assign_pointer(list_next_rcu(prev), new);
next->prev = new;
}
#else
void __list_add_rcu(struct list_head *new,
struct list_head *prev, struct list_head *next);
#endif
/**
* list_add_rcu - add a new entry to rcu-protected list

View File

@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end,
/*
* Tracepoint for rcutorture readers. The first argument is the name
* of the RCU flavor from rcutorture's viewpoint and the second argument
* is the callback address.
* is the callback address. The third argument is the start time in
* seconds, and the last two arguments are the grace period numbers
* at the beginning and end of the read, respectively. Note that the
* callback address can be NULL.
*/
TRACE_EVENT(rcu_torture_read,

View File

@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
static void rcu_read_delay(struct torture_random_state *rrsp)
{
unsigned long started;
unsigned long completed;
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;
unsigned long long ts;
/* We want a short delay sometimes to make a reader delay the grace
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->completed();
ts = rcu_trace_clock_local();
mdelay(longdelay_ms);
completed = cur_ops->completed();
do_trace_rcu_torture_read(cur_ops->name, NULL, ts,
started, completed);
}
if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT

View File

@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
if (!rcu_kick_kthreads)
return;
j = READ_ONCE(rsp->jiffies_kick_kthreads);
if (time_after(jiffies, j) && rsp->gp_kthread) {
if (time_after(jiffies, j) && rsp->gp_kthread &&
(rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) {
WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
rcu_ftrace_dump(DUMP_ALL);
wake_up_process(rsp->gp_kthread);
@ -2828,8 +2829,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* Also schedule RCU core processing.
*
* This function must be called from hardirq context. It is normally
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
* invoked from the scheduling-clock interrupt.
*/
void rcu_check_callbacks(int user)
{
@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
unsigned long flags;
struct rcu_data *rdp;
WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
WRITE_ONCE(head->func, rcu_leak_callback);
@ -3130,13 +3132,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
}
head->func = func;
head->next = NULL;
/*
* Opportunistically note grace-period endings and beginnings.
* Note that we might see a beginning right after we see an
* end, but never vice versa, since this CPU has to pass through
* a quiescent state betweentimes.
*/
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);

View File

@ -404,6 +404,7 @@ struct rcu_data {
atomic_long_t exp_workdone1; /* # done by others #1. */
atomic_long_t exp_workdone2; /* # done by others #2. */
atomic_long_t exp_workdone3; /* # done by others #3. */
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU

View File

@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
rdp->exp_dynticks_snap =
atomic_add_return(0, &rdtp->dynticks);
if (raw_smp_processor_id() == cpu ||
!(atomic_add_return(0, &rdtp->dynticks) & 0x1) ||
!(rdp->exp_dynticks_snap & 0x1) ||
!(rnp->qsmaskinitnext & rdp->grpmask))
mask_ofl_test |= rdp->grpmask;
}
@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
/* IPI the remaining CPUs for expedited quiescent state. */
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
if (!(mask_ofl_ipi & mask))
continue;
retry_ipi:
if (atomic_add_return(0, &rdtp->dynticks) !=
rdp->exp_dynticks_snap) {
mask_ofl_test |= mask;
continue;
}
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;

View File

@ -1218,7 +1218,7 @@ config DEBUG_BUGVERBOSE
config DEBUG_LIST
bool "Debug linked list manipulation"
depends on DEBUG_KERNEL
depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
help
Enable this to turn on extended checks in the linked-list
walking routines.
@ -1434,7 +1434,8 @@ config RCU_TRACE
select TRACE_CLOCK
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
in debugfs for debugging RCU implementation. It also enables
additional tracepoints for ftrace-style event tracing.
Say Y here if you want to enable RCU tracing
Say N if you are unsure.
@ -1964,6 +1965,16 @@ config TEST_STATIC_KEYS
If unsure, say N.
config BUG_ON_DATA_CORRUPTION
bool "Trigger a BUG when data corruption is detected"
select DEBUG_LIST
help
Select this option if the kernel should BUG when it encounters
data corruption in kernel memory structures when they get checked
for validity.
If unsure, say N.
source "samples/Kconfig"
source "lib/Kconfig.kgdb"

View File

@ -2,8 +2,7 @@
* Copyright 2006, Red Hat, Inc., Dave Jones
* Released under the General Public License (GPL).
*
* This file contains the linked list implementations for
* DEBUG_LIST.
* This file contains the linked list validation for DEBUG_LIST.
*/
#include <linux/export.h>
@ -13,88 +12,48 @@
#include <linux/rculist.h>
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
* Check that the data structures for the list manipulations are reasonably
* valid. Failures here indicate memory corruption (and possibly an exploit
* attempt).
*/
void __list_add(struct list_head *new,
struct list_head *prev,
bool __list_add_valid(struct list_head *new, struct list_head *prev,
struct list_head *next)
{
WARN(next->prev != prev,
"list_add corruption. next->prev should be "
"prev (%p), but was %p. (next=%p).\n",
CHECK_DATA_CORRUPTION(next->prev != prev,
"list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
prev, next->prev, next);
WARN(prev->next != next,
"list_add corruption. prev->next should be "
"next (%p), but was %p. (prev=%p).\n",
CHECK_DATA_CORRUPTION(prev->next != next,
"list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
next, prev->next, prev);
WARN(new == prev || new == next,
CHECK_DATA_CORRUPTION(new == prev || new == next,
"list_add double add: new=%p, prev=%p, next=%p.\n",
new, prev, next);
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
EXPORT_SYMBOL(__list_add);
void __list_del_entry(struct list_head *entry)
return true;
}
EXPORT_SYMBOL(__list_add_valid);
bool __list_del_entry_valid(struct list_head *entry)
{
struct list_head *prev, *next;
prev = entry->prev;
next = entry->next;
if (WARN(next == LIST_POISON1,
CHECK_DATA_CORRUPTION(next == LIST_POISON1,
"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
entry, LIST_POISON1) ||
WARN(prev == LIST_POISON2,
entry, LIST_POISON1);
CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
entry, LIST_POISON2) ||
WARN(prev->next != entry,
"list_del corruption. prev->next should be %p, "
"but was %p\n", entry, prev->next) ||
WARN(next->prev != entry,
"list_del corruption. next->prev should be %p, "
"but was %p\n", entry, next->prev))
return;
entry, LIST_POISON2);
CHECK_DATA_CORRUPTION(prev->next != entry,
"list_del corruption. prev->next should be %p, but was %p\n",
entry, prev->next);
CHECK_DATA_CORRUPTION(next->prev != entry,
"list_del corruption. next->prev should be %p, but was %p\n",
entry, next->prev);
return true;
__list_del(prev, next);
}
EXPORT_SYMBOL(__list_del_entry);
/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
* Note: list_empty on entry does not return true after this, the entry is
* in an undefined state.
*/
void list_del(struct list_head *entry)
{
__list_del_entry(entry);
entry->next = LIST_POISON1;
entry->prev = LIST_POISON2;
}
EXPORT_SYMBOL(list_del);
/*
* RCU variants.
*/
void __list_add_rcu(struct list_head *new,
struct list_head *prev, struct list_head *next)
{
WARN(next->prev != prev,
"list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
prev, next->prev, next);
WARN(prev->next != next,
"list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
next, prev->next, prev);
new->next = next;
new->prev = prev;
rcu_assign_pointer(list_next_rcu(prev), new);
next->prev = new;
}
EXPORT_SYMBOL(__list_add_rcu);
EXPORT_SYMBOL(__list_del_entry_valid);

View File

@ -1,6 +1,4 @@
initrd
linux-2.6
b[0-9]*
rcu-test-image
res
*.swp

View File

@ -303,6 +303,7 @@ then
fi
___EOF___
awk < $T/cfgcpu.pack \
-v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \
-v CONFIGDIR="$CONFIGFRAG/" \
-v KVM="$KVM" \
-v ncpus=$cpus \
@ -375,6 +376,10 @@ function dump(first, pastlast, batchnum)
njitter = ncpus;
else
njitter = ja[1];
if (TORTURE_BUILDONLY && njitter != 0) {
njitter = 0;
print "echo Build-only run, so suppressing jitter >> " rd "/log"
}
for (j = 0; j < njitter; j++)
print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&"
print "wait"