Merge branch 'lkmm-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into locking/urgent

Pull the "Linux kernel memory model" tooling implementation from Paul E. McKenney:

 "This pull request contains a single commit that adds a memory model to
  the tools directory. This memory model can (roughly speaking) be thought
  of as an automated version of memory-barriers.txt. It is written in the
  "cat" language, which is executable by the externally provided "herd7"
  simulator, which exhaustively explores the state space of small litmus
  tests."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
commit 7246a966b6

tools/memory-model/Documentation/cheatsheet.txt (new file, 30 lines)

                                 Prior Operation     Subsequent Operation
                                 ---------------     ---------------------------
                              C  Self R W RWM        Self R W DR DW RMW SV
                              __ ---- - - ---        ---- - - -- -- --- --

Store, e.g., WRITE_ONCE()        Y Y
Load, e.g., READ_ONCE()          Y Y Y
Unsuccessful RMW operation       Y Y Y
smp_read_barrier_depends()       Y Y Y
*_dereference()                  Y Y Y Y
Successful *_acquire()           R Y Y Y Y Y Y
Successful *_release()           C Y Y Y W Y
smp_rmb()                        Y R Y Y R
smp_wmb()                        Y W Y Y W
smp_mb() & synchronize_rcu()     CP Y Y Y Y Y Y Y Y
Successful full non-void RMW     CP Y Y Y Y Y Y Y Y Y Y Y
smp_mb__before_atomic()          CP Y Y Y a a a a Y
smp_mb__after_atomic()           CP a a Y Y Y Y Y


Key:    C:      Ordering is cumulative
        P:      Ordering propagates
        R:      Read, for example, READ_ONCE(), or read portion of RMW
        W:      Write, for example, WRITE_ONCE(), or write portion of RMW
        Y:      Provides ordering
        a:      Provides ordering given intervening RMW atomic operation
        DR:     Dependent read (address dependency)
        DW:     Dependent write (address, data, or control dependency)
        RMW:    Atomic read-modify-write operation
        SV:     Same-variable access

tools/memory-model/Documentation/explanation.txt (new file, 1840 lines; diff not shown because the file is too large)

tools/memory-model/Documentation/recipes.txt (new file, 570 lines)

This document provides "recipes", that is, litmus tests for commonly
occurring situations, as well as a few that illustrate subtly broken but
attractive nuisances. Many of these recipes include example code from
v4.13 of the Linux kernel.

The first section covers simple special cases, the second section
takes off the training wheels to cover more involved examples,
and the third section provides a few rules of thumb.


Simple special cases
====================

This section presents two simple special cases, the first being where
there is only one CPU or only one memory location is accessed, and the
second being use of that old concurrency workhorse, locking.


Single CPU or single memory location
------------------------------------

If there is only one CPU on the one hand or only one variable
on the other, the code will execute in order. There are (as
usual) some things to be careful of:

1.  Some aspects of the C language are unordered. For example,
    in the expression "f(x) + g(y)", the order in which f and g are
    called is not defined; the object code is allowed to use either
    order or even to interleave the computations (see the sketch
    following this list).

2.  Compilers are permitted to use the "as-if" rule. That is, a
    compiler can emit whatever code it likes for normal accesses,
    as long as the results of a single-threaded execution appear
    just as if the compiler had followed all the relevant rules.
    To see this, compile with a high level of optimization and run
    the debugger on the resulting binary.

3.  If there is only one variable but multiple CPUs, that variable
    must be properly aligned and all accesses to that variable must
    be full sized. Variables that straddle cachelines or pages void
    your full-ordering warranty, as do undersized accesses that load
    from or store to only part of the variable.

4.  If there are multiple CPUs, accesses to shared variables should
    use READ_ONCE() and WRITE_ONCE() or stronger to prevent load/store
    tearing, load/store fusing, and invented loads and stores; a brief
    sketch follows this list. There are exceptions to this rule,
    including:

    i.  When there is no possibility of a given shared variable
        being updated by some other CPU, for example, while
        holding the update-side lock, reads from that variable
        need not use READ_ONCE().

    ii. When there is no possibility of a given shared variable
        being either read or updated by other CPUs, for example,
        when running during early boot, reads from that variable
        need not use READ_ONCE() and writes to that variable
        need not use WRITE_ONCE().
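
The following sketch is not taken from the kernel; it simply illustrates
items 1 and 4 above using made-up variables and functions (x, y, sum,
f(), g(), need_to_stop, and do_something_quickly()):

        /* Item 1: the order of the f() and g() calls is unspecified. */
        sum = f(x) + g(y);      /* Either call may execute first. */

        /*
         * Item 4: with a plain access, the compiler may "fuse" the loads
         * and keep testing a stale, register-cached value forever.
         */
        while (!need_to_stop)                   /* BUG: may load only once. */
                do_something_quickly();

        /* READ_ONCE() forces a fresh load from memory on each iteration. */
        while (!READ_ONCE(need_to_stop))
                do_something_quickly();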


Locking
-------

Locking is well-known and straightforward, at least if you don't think
about it too hard. And the basic rule is indeed quite simple: Any CPU that
has acquired a given lock sees any changes previously seen or made by any
CPU before it released that same lock. Note that this statement is a bit
stronger than "Any CPU holding a given lock sees all changes made by any
CPU during the time that CPU was holding this same lock". For example,
consider the following pair of code fragments:

        /* See MP+polocks.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                spin_lock(&mylock);
                WRITE_ONCE(y, 1);
                spin_unlock(&mylock);
        }

        void CPU1(void)
        {
                spin_lock(&mylock);
                r0 = READ_ONCE(y);
                spin_unlock(&mylock);
                r1 = READ_ONCE(x);
        }

The basic rule guarantees that if CPU0() acquires mylock before CPU1(),
then both r0 and r1 must be set to the value 1. This also has the
consequence that if the final value of r0 is equal to 1, then the final
value of r1 must also be equal to 1. In contrast, the weaker rule would
say nothing about the final value of r1.

The converse to the basic rule also holds, as illustrated by the
following litmus test:

        /* See MP+porevlocks.litmus. */
        void CPU0(void)
        {
                r0 = READ_ONCE(y);
                spin_lock(&mylock);
                r1 = READ_ONCE(x);
                spin_unlock(&mylock);
        }

        void CPU1(void)
        {
                spin_lock(&mylock);
                WRITE_ONCE(x, 1);
                spin_unlock(&mylock);
                WRITE_ONCE(y, 1);
        }

This converse to the basic rule guarantees that if CPU0() acquires
mylock before CPU1(), then both r0 and r1 must be set to the value 0.
This also has the consequence that if the final value of r1 is equal
to 0, then the final value of r0 must also be equal to 0. In contrast,
the weaker rule would say nothing about the final value of r0.

These examples show only a single pair of CPUs, but the effects of the
locking basic rule extend across multiple acquisitions of a given lock
across multiple CPUs.

However, it is not necessarily the case that accesses ordered by
locking will be seen as ordered by CPUs not holding that lock.
Consider this example:

        /* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */
        void CPU0(void)
        {
                spin_lock(&mylock);
                WRITE_ONCE(x, 1);
                WRITE_ONCE(y, 1);
                spin_unlock(&mylock);
        }

        void CPU1(void)
        {
                spin_lock(&mylock);
                r0 = READ_ONCE(y);
                WRITE_ONCE(z, 1);
                spin_unlock(&mylock);
        }

        void CPU2(void)
        {
                WRITE_ONCE(z, 2);
                smp_mb();
                r1 = READ_ONCE(x);
        }

Counter-intuitive though it might be, it is quite possible to have
the final value of r0 be 1, the final value of z be 2, and the final
value of r1 be 0. The reason for this surprising outcome is that
CPU2() never acquired the lock, and thus did not benefit from the
lock's ordering properties.

Ordering can be extended to CPUs not holding the lock by careful use
of smp_mb__after_spinlock():

        /* See Z6.0+pooncelock+poonceLock+pombonce.litmus. */
        void CPU0(void)
        {
                spin_lock(&mylock);
                WRITE_ONCE(x, 1);
                WRITE_ONCE(y, 1);
                spin_unlock(&mylock);
        }

        void CPU1(void)
        {
                spin_lock(&mylock);
                smp_mb__after_spinlock();
                r0 = READ_ONCE(y);
                WRITE_ONCE(z, 1);
                spin_unlock(&mylock);
        }

        void CPU2(void)
        {
                WRITE_ONCE(z, 2);
                smp_mb();
                r1 = READ_ONCE(x);
        }

This addition of smp_mb__after_spinlock() strengthens the lock acquisition
sufficiently to rule out the counter-intuitive outcome.


Taking off the training wheels
==============================

This section looks at more complex examples, including message passing,
load buffering, release-acquire chains, and store buffering.
Many classes of litmus tests have abbreviated names, which may be found
here: https://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test6.pdf


Message passing (MP)
--------------------

The MP pattern has one CPU execute a pair of stores to a pair of variables
and another CPU execute a pair of loads from this same pair of variables,
but in the opposite order. The goal is to avoid the counter-intuitive
outcome in which the first load sees the value written by the second store
but the second load does not see the value written by the first store.
In the absence of any ordering, this goal may not be met, as can be seen
in the MP+poonceonces.litmus litmus test. This section therefore looks at
a number of ways of meeting this goal.


Release and acquire
~~~~~~~~~~~~~~~~~~~

Use of smp_store_release() and smp_load_acquire() is one way to force
the desired MP ordering. The general approach is shown below:

        /* See MP+pooncerelease+poacquireonce.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                smp_store_release(&y, 1);
        }

        void CPU1(void)
        {
                r0 = smp_load_acquire(&y);
                r1 = READ_ONCE(x);
        }

The smp_store_release() macro orders any prior accesses against the
store, while the smp_load_acquire() macro orders the load against any
subsequent accesses. Therefore, if the final value of r0 is the value 1,
the final value of r1 must also be the value 1.

The init_stack_slab() function in lib/stackdepot.c uses release-acquire
in this way to safely initialize a slab of the stack. Working out
the mutual-exclusion design is left as an exercise for the reader.


Assign and dereference
~~~~~~~~~~~~~~~~~~~~~~

Use of rcu_assign_pointer() and rcu_dereference() is quite similar to the
use of smp_store_release() and smp_load_acquire(), except that both
rcu_assign_pointer() and rcu_dereference() operate on RCU-protected
pointers. The general approach is shown below:

        /* See MP+onceassign+derefonce.litmus. */
        int z;
        int *y = &z;
        int x;

        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                rcu_assign_pointer(y, &x);
        }

        void CPU1(void)
        {
                rcu_read_lock();
                r0 = rcu_dereference(y);
                r1 = READ_ONCE(*r0);
                rcu_read_unlock();
        }

In this example, if the final value of r0 is &x then the final value of
r1 must be 1.

The rcu_assign_pointer() macro has the same ordering properties as does
smp_store_release(), but the rcu_dereference() macro orders the load only
against later accesses that depend on the value loaded. A dependency
is present if the value loaded determines the address of a later access
(address dependency, as shown above), the value written by a later store
(data dependency), or whether or not a later store is executed in the
first place (control dependency). Note that the term "data dependency"
is sometimes casually used to cover both address and data dependencies.
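
The following sketch (not from the kernel sources, and using made-up
variables gp, p, w, y, and z) illustrates which later accesses are and
are not ordered by rcu_dereference():

        p = rcu_dereference(gp);
        r1 = p->a;              /* Address dependency: ordered. */
        WRITE_ONCE(p->b, 1);    /* Address dependency into the store: ordered. */
        WRITE_ONCE(y, r1 + 1);  /* Data dependency into the stored value: ordered. */
        if (r1)                 /* Control dependency: the store below is ordered. */
                WRITE_ONCE(z, 1);
        r2 = READ_ONCE(w);      /* No dependency: NOT ordered by rcu_dereference(). */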

In lib/prime_numbers.c, the expand_to_next_prime() function invokes
rcu_assign_pointer(), and the next_prime_number() function invokes
rcu_dereference(). This combination mediates access to a bit vector
that is expanded as additional primes are needed.


Write and read memory barriers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

It is usually better to use smp_store_release() instead of smp_wmb()
and to use smp_load_acquire() instead of smp_rmb(). However, the older
smp_wmb() and smp_rmb() APIs are still heavily used, so it is important
to understand their use cases. The general approach is shown below:

        /* See MP+wmbonceonce+rmbonceonce.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                smp_wmb();
                WRITE_ONCE(y, 1);
        }

        void CPU1(void)
        {
                r0 = READ_ONCE(y);
                smp_rmb();
                r1 = READ_ONCE(x);
        }

The smp_wmb() macro orders prior stores against later stores, and the
smp_rmb() macro orders prior loads against later loads. Therefore, if
the final value of r0 is 1, the final value of r1 must also be 1.

The xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains
the following write-side code fragment:

        log->l_curr_block -= log->l_logBBsize;
        ASSERT(log->l_curr_block >= 0);
        smp_wmb();
        log->l_curr_cycle++;

And the xlog_valid_lsn() function in fs/xfs/xfs_log_priv.h contains
the corresponding read-side code fragment:

        cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
        smp_rmb();
        cur_block = ACCESS_ONCE(log->l_curr_block);

Alternatively, consider the following comment in function
perf_output_put_handle() in kernel/events/ring_buffer.c:

 *   kernel                             user
 *
 *   if (LOAD ->data_tail) {            LOAD ->data_head
 *                      (A)             smp_rmb()       (C)
 *      STORE $data                     LOAD $data
 *      smp_wmb()       (B)             smp_mb()        (D)
 *      STORE ->data_head               STORE ->data_tail
 *   }

The B/C pairing is an example of the MP pattern using smp_wmb() on the
write side and smp_rmb() on the read side.

Of course, given that smp_mb() is strictly stronger than either smp_wmb()
or smp_rmb(), any code fragment that would work with smp_rmb() and
smp_wmb() would also work with smp_mb() replacing either or both of the
weaker barriers.


Load buffering (LB)
-------------------

The LB pattern has one CPU load from one variable and then store to a
second, while another CPU loads from the second variable and then stores
to the first. The goal is to avoid the counter-intuitive situation where
each load reads the value written by the other CPU's store. In the
absence of any ordering it is quite possible that this may happen, as
can be seen in the LB+poonceonces.litmus litmus test.

One way of avoiding the counter-intuitive outcome is through the use of a
control dependency paired with a full memory barrier:

        /* See LB+ctrlonceonce+mbonceonce.litmus. */
        void CPU0(void)
        {
                r0 = READ_ONCE(x);
                if (r0)
                        WRITE_ONCE(y, 1);
        }

        void CPU1(void)
        {
                r1 = READ_ONCE(y);
                smp_mb();
                WRITE_ONCE(x, 1);
        }

This pairing of a control dependency in CPU0() with a full memory
barrier in CPU1() prevents r0 and r1 from both ending up equal to 1.

The A/D pairing from the ring-buffer use case shown earlier also
illustrates LB. Here is a repeat of the comment in
perf_output_put_handle() in kernel/events/ring_buffer.c, showing a
control dependency on the kernel side and a full memory barrier on
the user side:

 *   kernel                             user
 *
 *   if (LOAD ->data_tail) {            LOAD ->data_head
 *                      (A)             smp_rmb()       (C)
 *      STORE $data                     LOAD $data
 *      smp_wmb()       (B)             smp_mb()        (D)
 *      STORE ->data_head               STORE ->data_tail
 *   }
 *
 * Where A pairs with D, and B pairs with C.

The kernel's control dependency between the load from ->data_tail
and the store to data combined with the user's full memory barrier
between the load from data and the store to ->data_tail prevents
the counter-intuitive outcome where the kernel overwrites the data
before the user gets done loading it.


Release-acquire chains
----------------------

Release-acquire chains are a low-overhead, flexible, and easy-to-use
method of maintaining order. However, they do have some limitations that
need to be fully understood. Here is an example that maintains order:

        /* See ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                smp_store_release(&y, 1);
        }

        void CPU1(void)
        {
                r0 = smp_load_acquire(&y);
                smp_store_release(&z, 1);
        }

        void CPU2(void)
        {
                r1 = smp_load_acquire(&z);
                r2 = READ_ONCE(x);
        }

In this case, if r0 and r1 both have final values of 1, then r2 must
also have a final value of 1.

The ordering in this example is stronger than it needs to be. For
example, ordering would still be preserved if CPU1()'s smp_load_acquire()
invocation was replaced with READ_ONCE().

It is tempting to assume that CPU0()'s store to x is globally ordered
before CPU1()'s store to z, but this is not the case:

        /* See Z6.0+pooncerelease+poacquirerelease+mbonceonce.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                smp_store_release(&y, 1);
        }

        void CPU1(void)
        {
                r0 = smp_load_acquire(&y);
                smp_store_release(&z, 1);
        }

        void CPU2(void)
        {
                WRITE_ONCE(z, 2);
                smp_mb();
                r1 = READ_ONCE(x);
        }

One might hope that if the final value of r0 is 1 and the final value
of z is 2, then the final value of r1 must also be 1, but it really is
possible for r1 to have the final value of 0. The reason, of course,
is that in this version, CPU2() is not part of the release-acquire chain.
This situation is accounted for in the rules of thumb below.

Despite this limitation, release-acquire chains are low-overhead as
well as simple and powerful, at least as memory-ordering mechanisms go.


Store buffering
---------------

Store buffering can be thought of as upside-down load buffering, so
that one CPU first stores to one variable and then loads from a second,
while another CPU stores to the second variable and then loads from the
first. Preserving order requires nothing less than full barriers:

        /* See SB+mbonceonces.litmus. */
        void CPU0(void)
        {
                WRITE_ONCE(x, 1);
                smp_mb();
                r0 = READ_ONCE(y);
        }

        void CPU1(void)
        {
                WRITE_ONCE(y, 1);
                smp_mb();
                r1 = READ_ONCE(x);
        }

Omitting either smp_mb() will allow both r0 and r1 to have final
values of 0, but providing both full barriers as shown above prevents
this counter-intuitive outcome.

This pattern most famously appears as part of Dekker's locking
algorithm, but it has a much more practical use within the Linux kernel
of ordering wakeups. The following comment taken from waitqueue_active()
in include/linux/wait.h shows the canonical pattern:

 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))    if (@cond)
 *        wake_up(wq_head);                 break;
 *                                        schedule();
 *                                      }
 *      finish_wait(&wq_head, &wait);

On CPU0, the store is to @cond and the load is in waitqueue_active().
On CPU1, prepare_to_wait() contains both a store to wq_head and a call
to set_current_state(), which contains an smp_mb() barrier; the load is
"if (@cond)". The full barriers prevent the undesirable outcome where
CPU1 puts the waiting task to sleep and CPU0 fails to wake it up.

Note that use of locking can greatly simplify this pattern.


Rules of thumb
==============

There might seem to be no pattern governing what ordering primitives are
needed in which situations, but this is not the case. There is a pattern
based on the relation between the accesses linking successive CPUs in a
given litmus test. There are three types of linkage:

1.  Write-to-read, where the next CPU reads the value that the
    previous CPU wrote. The LB litmus-test patterns contain only
    this type of relation. In formal memory-modeling texts, this
    relation is called "reads-from" and is usually abbreviated "rf".

2.  Read-to-write, where the next CPU overwrites the value that the
    previous CPU read. The SB litmus test contains only this type
    of relation. In formal memory-modeling texts, this relation is
    often called "from-reads" and is sometimes abbreviated "fr".

3.  Write-to-write, where the next CPU overwrites the value written
    by the previous CPU. The Z6.0 litmus test pattern contains a
    write-to-write relation between the last access of CPU1() and
    the first access of CPU2(). In formal memory-modeling texts,
    this relation is often called "coherence order" and is sometimes
    abbreviated "co". In the C++ standard, it is instead called
    "modification order" and often abbreviated "mo".

The strength of memory ordering required for a given litmus test to
avoid a counter-intuitive outcome depends on the types of relations
linking the memory accesses for the outcome in question:

o   If all links are write-to-read links, then the weakest
    possible ordering within each CPU suffices. For example, in
    the LB litmus test, a control dependency was enough to do the
    job.

o   If all but one of the links are write-to-read links, then a
    release-acquire chain suffices. Both the MP and the ISA2
    litmus tests illustrate this case.

o   If more than one of the links are something other than
    write-to-read links, then a full memory barrier is required
    between each successive pair of non-write-to-read links. This
    case is illustrated by the Z6.0 litmus tests, both in the
    locking and in the release-acquire sections.

However, if you find yourself having to stretch these rules of thumb
to fit your situation, you should consider creating a litmus test and
running it on the model.

tools/memory-model/Documentation/references.txt (new file, 107 lines)

This document provides background reading for memory models and related
tools. These documents are aimed at kernel hackers who are interested
in memory models.


Hardware manuals and models
===========================

o       SPARC International Inc. (Ed.). 1994. "The SPARC Architecture
        Reference Manual Version 9". SPARC International Inc.

o       Compaq Computer Corporation (Ed.). 2002. "Alpha Architecture
        Reference Manual". Compaq Computer Corporation.

o       Intel Corporation (Ed.). 2002. "A Formal Specification of Intel
        Itanium Processor Family Memory Ordering". Intel Corporation.

o       Intel Corporation (Ed.). 2002. "Intel 64 and IA-32 Architectures
        Software Developer's Manual". Intel Corporation.

o       Peter Sewell, Susmit Sarkar, Scott Owens, Francesco Zappa Nardelli,
        and Magnus O. Myreen. 2010. "x86-TSO: A Rigorous and Usable
        Programmer's Model for x86 Multiprocessors". Commun. ACM 53, 7
        (July, 2010), 89-97. http://doi.acm.org/10.1145/1785414.1785443

o       IBM Corporation (Ed.). 2009. "Power ISA Version 2.06". IBM
        Corporation.

o       ARM Ltd. (Ed.). 2009. "ARM Barrier Litmus Tests and Cookbook".
        ARM Ltd.

o       Susmit Sarkar, Peter Sewell, Jade Alglave, Luc Maranget, and
        Derek Williams. 2011. "Understanding POWER Multiprocessors". In
        Proceedings of the 32nd ACM SIGPLAN Conference on Programming
        Language Design and Implementation (PLDI '11). ACM, New York,
        NY, USA, 175-186.

o       Susmit Sarkar, Kayvan Memarian, Scott Owens, Mark Batty,
        Peter Sewell, Luc Maranget, Jade Alglave, and Derek Williams.
        2012. "Synchronising C/C++ and POWER". In Proceedings of the 33rd
        ACM SIGPLAN Conference on Programming Language Design and
        Implementation (PLDI '12). ACM, New York, NY, USA, 311-322.

o       ARM Ltd. (Ed.). 2014. "ARM Architecture Reference Manual (ARMv8,
        for ARMv8-A architecture profile)". ARM Ltd.

o       Imagination Technologies, LTD. 2015. "MIPS(R) Architecture
        For Programmers, Volume II-A: The MIPS64(R) Instruction,
        Set Reference Manual". Imagination Technologies,
        LTD. https://imgtec.com/?do-download=4302.

o       Shaked Flur, Kathryn E. Gray, Christopher Pulte, Susmit
        Sarkar, Ali Sezgin, Luc Maranget, Will Deacon, and Peter
        Sewell. 2016. "Modelling the ARMv8 Architecture, Operationally:
        Concurrency and ISA". In Proceedings of the 43rd Annual ACM
        SIGPLAN-SIGACT Symposium on Principles of Programming Languages
        (POPL '16). ACM, New York, NY, USA, 608-621.

o       Shaked Flur, Susmit Sarkar, Christopher Pulte, Kyndylan Nienhuis,
        Luc Maranget, Kathryn E. Gray, Ali Sezgin, Mark Batty, and Peter
        Sewell. 2017. "Mixed-size Concurrency: ARM, POWER, C/C++11,
        and SC". In Proceedings of the 44th ACM SIGPLAN Symposium on
        Principles of Programming Languages (POPL 2017). ACM, New York,
        NY, USA, 429-442.


Linux-kernel memory model
=========================

o       Andrea Parri, Alan Stern, Luc Maranget, Paul E. McKenney,
        and Jade Alglave. 2017. "A formal model of
        Linux-kernel memory ordering - companion webpage".
        http://moscova.inria.fr/~maranget/cats7/linux/. (2017). [Online;
        accessed 30-January-2017].

o       Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
        Alan Stern. 2017. "A formal kernel memory-ordering model (part 1)".
        Linux Weekly News. https://lwn.net/Articles/718628/

o       Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
        Alan Stern. 2017. "A formal kernel memory-ordering model (part 2)".
        Linux Weekly News. https://lwn.net/Articles/720550/


Memory-model tooling
====================

o       Daniel Jackson. 2002. "Alloy: A Lightweight Object Modelling
        Notation". ACM Trans. Softw. Eng. Methodol. 11, 2 (April 2002),
        256-290. http://doi.acm.org/10.1145/505145.505149

o       Jade Alglave, Luc Maranget, and Michael Tautschnig. 2014. "Herding
        Cats: Modelling, Simulation, Testing, and Data Mining for Weak
        Memory". ACM Trans. Program. Lang. Syst. 36, 2, Article 7 (July
        2014), 7:1-7:74 pages.

o       Jade Alglave, Patrick Cousot, and Luc Maranget. 2016. "Syntax and
        semantics of the weak consistency model specification language
        cat". CoRR abs/1608.07531 (2016). http://arxiv.org/abs/1608.07531


Memory-model comparisons
========================

o       Paul E. McKenney, Ulrich Weigand, Andrea Parri, and Boqun
        Feng. 2016. "Linux-Kernel Memory Model". (6 June 2016).
        http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0124r2.html.

tools/memory-model/MAINTAINERS (new file, 15 lines)

LINUX KERNEL MEMORY MODEL
M:      Alan Stern <stern@rowland.harvard.edu>
M:      Andrea Parri <parri.andrea@gmail.com>
M:      Will Deacon <will.deacon@arm.com>
M:      Peter Zijlstra <peterz@infradead.org>
M:      Boqun Feng <boqun.feng@gmail.com>
M:      Nicholas Piggin <npiggin@gmail.com>
M:      David Howells <dhowells@redhat.com>
M:      Jade Alglave <j.alglave@ucl.ac.uk>
M:      Luc Maranget <luc.maranget@inria.fr>
M:      "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
L:      linux-kernel@vger.kernel.org
S:      Supported
T:      git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
F:      tools/memory-model/

tools/memory-model/README (new file, 220 lines)

=========================
LINUX KERNEL MEMORY MODEL
=========================

============
INTRODUCTION
============

This directory contains the memory model of the Linux kernel, written
in the "cat" language and executable by the (externally provided)
"herd7" simulator, which exhaustively explores the state space of
small litmus tests.

In addition, the "klitmus7" tool (also externally provided) may be used
to convert a litmus test to a Linux kernel module, which in turn allows
that litmus test to be exercised within the Linux kernel.


============
REQUIREMENTS
============

The "herd7" and "klitmus7" tools must be downloaded separately:

        https://github.com/herd/herdtools7

See "herdtools7/INSTALL.md" for installation instructions.

Alternatively, Abhishek Bhardwaj has kindly provided a Docker image
of these tools at "abhishek40/memory-model". Abhishek suggests the
following commands to install and use this image:

        - Users should install Docker for their distribution.
        - docker run -itd abhishek40/memory-model
        - docker attach <id-emitted-from-the-previous-command>

Gentoo users might wish to make use of Patrick McLean's package:

        https://gitweb.gentoo.org/repo/gentoo.git/tree/dev-util/herdtools7

These packages may not be up-to-date with respect to the GitHub
repository.


==================
BASIC USAGE: HERD7
==================

The memory model is used, in conjunction with "herd7", to exhaustively
explore the state space of small litmus tests.

For example, to run SB+mbonceonces.litmus against the memory model:

        $ herd7 -conf linux-kernel.cfg litmus-tests/SB+mbonceonces.litmus

Here is the corresponding output:

        Test SB+mbonceonces Allowed
        States 3
        0:r0=0; 1:r0=1;
        0:r0=1; 1:r0=0;
        0:r0=1; 1:r0=1;
        No
        Witnesses
        Positive: 0 Negative: 3
        Condition exists (0:r0=0 /\ 1:r0=0)
        Observation SB+mbonceonces Never 0 3
        Time SB+mbonceonces 0.01
        Hash=d66d99523e2cac6b06e66f4c995ebb48

The "Positive: 0 Negative: 3" and the "Never 0 3" each indicate that
this litmus test's "exists" clause cannot be satisfied.

See "herd7 -help" or "herdtools7/doc/" for more information.


=====================
BASIC USAGE: KLITMUS7
=====================

The "klitmus7" tool converts a litmus test into a Linux kernel module,
which may then be loaded and run.

For example, to run SB+mbonceonces.litmus against hardware:

        $ mkdir mymodules
        $ klitmus7 -o mymodules litmus-tests/SB+mbonceonces.litmus
        $ cd mymodules ; make
        $ sudo sh run.sh

The corresponding output includes:

        Test SB+mbonceonces Allowed
        Histogram (3 states)
        644580  :>0:r0=1; 1:r0=0;
        644328  :>0:r0=0; 1:r0=1;
        711092  :>0:r0=1; 1:r0=1;
        No
        Witnesses
        Positive: 0, Negative: 2000000
        Condition exists (0:r0=0 /\ 1:r0=0) is NOT validated
        Hash=d66d99523e2cac6b06e66f4c995ebb48
        Observation SB+mbonceonces Never 0 2000000
        Time SB+mbonceonces 0.16

The "Positive: 0 Negative: 2000000" and the "Never 0 2000000" indicate
that during two million trials, the state specified in this litmus
test's "exists" clause was not reached.

And, as with "herd7", please see "klitmus7 -help" or "herdtools7/doc/"
for more information.


====================
DESCRIPTION OF FILES
====================

Documentation/cheatsheet.txt
        Quick-reference guide to the Linux-kernel memory model.

Documentation/explanation.txt
        Describes the memory model in detail.

Documentation/recipes.txt
        Lists common memory-ordering patterns.

Documentation/references.txt
        Provides background reading.

linux-kernel.bell
        Categorizes the relevant instructions, including memory
        references, memory barriers, atomic read-modify-write operations,
        lock acquisition/release, and RCU operations.

        More formally, this file (1) lists the subtypes of the various
        event types used by the memory model and (2) performs RCU
        read-side critical section nesting analysis.

linux-kernel.cat
        Specifies what reorderings are forbidden by memory references,
        memory barriers, atomic read-modify-write operations, and RCU.

        More formally, this file specifies what executions are forbidden
        by the memory model. Allowed executions are those which
        satisfy the model's "coherence", "atomic", "happens-before",
        "propagation", and "rcu" axioms, which are defined in the file.

linux-kernel.cfg
        Convenience file that gathers the common-case herd7 command-line
        arguments.

linux-kernel.def
        Maps from C-like syntax to herd7's internal litmus-test
        instruction-set architecture.

litmus-tests
        Directory containing a few representative litmus tests, which
        are listed in litmus-tests/README. A great many more litmus
        tests are available at https://github.com/paulmckrcu/litmus.

lock.cat
        Provides a front-end analysis of lock acquisition and release,
        for example, associating a lock acquisition with the preceding
        and following releases and checking for self-deadlock.

        More formally, this file defines a performance-enhanced scheme
        for generation of the possible reads-from and coherence order
        relations on the locking primitives.

README
        This file.


===========
LIMITATIONS
===========

The Linux-kernel memory model has the following limitations:

1.  Compiler optimizations are not modeled. Of course, the use
    of READ_ONCE() and WRITE_ONCE() limits the compiler's ability
    to optimize, but there is Linux-kernel code that uses bare C
    memory accesses. Handling this code is on the to-do list.
    For more information, see Documentation/explanation.txt (in
    particular, the "THE PROGRAM ORDER RELATION: po AND po-loc"
    and "A WARNING" sections).

2.  Multiple access sizes for a single variable are not supported,
    and neither are misaligned or partially overlapping accesses.

3.  Exceptions and interrupts are not modeled. In some cases,
    this limitation can be overcome by modeling the interrupt or
    exception with an additional process.

4.  I/O such as MMIO or DMA is not supported.

5.  Self-modifying code (such as that found in the kernel's
    alternatives mechanism, function tracer, Berkeley Packet Filter
    JIT compiler, and module loader) is not supported.

6.  Complete modeling of all variants of atomic read-modify-write
    operations, locking primitives, and RCU is not provided.
    For example, call_rcu() and rcu_barrier() are not supported.
    However, a substantial amount of support is provided for these
    operations, as shown in the linux-kernel.def file.

The "herd7" tool has some additional limitations of its own, apart from
the memory model:

1.  Non-trivial data structures such as arrays or structures are
    not supported. However, pointers are supported, allowing trivial
    linked lists to be constructed.

2.  Dynamic memory allocation is not supported, although this can
    be worked around in some cases by supplying multiple statically
    allocated variables.

Some of these limitations may be overcome in the future, but others are
more likely to be addressed by incorporating the Linux-kernel memory model
into other tools.

tools/memory-model/linux-kernel.bell (new file, 53 lines)

// SPDX-License-Identifier: GPL-2.0+
(*
 * Copyright (C) 2015 Jade Alglave <j.alglave@ucl.ac.uk>,
 * Copyright (C) 2016 Luc Maranget <luc.maranget@inria.fr> for Inria
 * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
 *                    Andrea Parri <parri.andrea@gmail.com>
 *
 * An earlier version of this file appears in the companion webpage for
 * "Frightening small children and disconcerting grown-ups: Concurrency
 * in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
 * which is to appear in ASPLOS 2018.
 *)

"Linux kernel memory model"

enum Accesses = 'once (*READ_ONCE,WRITE_ONCE,ACCESS_ONCE*) ||
                'release (*smp_store_release*) ||
                'acquire (*smp_load_acquire*) ||
                'noreturn (* R of non-return RMW *)
instructions R[{'once,'acquire,'noreturn}]
instructions W[{'once,'release}]
instructions RMW[{'once,'acquire,'release}]

enum Barriers = 'wmb (*smp_wmb*) ||
                'rmb (*smp_rmb*) ||
                'mb (*smp_mb*) ||
                'rb_dep (*smp_read_barrier_depends*) ||
                'rcu-lock (*rcu_read_lock*) ||
                'rcu-unlock (*rcu_read_unlock*) ||
                'sync-rcu (*synchronize_rcu*) ||
                'before_atomic (*smp_mb__before_atomic*) ||
                'after_atomic (*smp_mb__after_atomic*) ||
                'after_spinlock (*smp_mb__after_spinlock*)
instructions F[Barriers]

(* Compute matching pairs of nested Rcu-lock and Rcu-unlock *)
let matched = let rec
                unmatched-locks = Rcu-lock \ domain(matched)
            and unmatched-unlocks = Rcu-unlock \ range(matched)
            and unmatched = unmatched-locks | unmatched-unlocks
            and unmatched-po = [unmatched] ; po ; [unmatched]
            and unmatched-locks-to-unlocks =
                [unmatched-locks] ; po ; [unmatched-unlocks]
            and matched = matched | (unmatched-locks-to-unlocks \
                (unmatched-po ; unmatched-po))
        in matched

(* Validate nesting *)
flag ~empty Rcu-lock \ domain(matched) as unbalanced-rcu-locking
flag ~empty Rcu-unlock \ range(matched) as unbalanced-rcu-locking

(* Outermost level of nesting only *)
let crit = matched \ (po^-1 ; matched ; po^-1)

tools/memory-model/linux-kernel.cat (new file, 124 lines)

// SPDX-License-Identifier: GPL-2.0+
(*
 * Copyright (C) 2015 Jade Alglave <j.alglave@ucl.ac.uk>,
 * Copyright (C) 2016 Luc Maranget <luc.maranget@inria.fr> for Inria
 * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>,
 *                    Andrea Parri <parri.andrea@gmail.com>
 *
 * An earlier version of this file appears in the companion webpage for
 * "Frightening small children and disconcerting grown-ups: Concurrency
 * in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
 * which is to appear in ASPLOS 2018.
 *)

"Linux kernel memory model"

(*
 * File "lock.cat" handles locks and is experimental.
 * It can be replaced by include "cos.cat" for tests that do not use locks.
 *)

include "lock.cat"

(*******************)
(* Basic relations *)
(*******************)

(* Fences *)
let rb-dep = [R] ; fencerel(Rb_dep) ; [R]
let rmb = [R \ Noreturn] ; fencerel(Rmb) ; [R \ Noreturn]
let wmb = [W] ; fencerel(Wmb) ; [W]
let mb = ([M] ; fencerel(Mb) ; [M]) |
        ([M] ; fencerel(Before_atomic) ; [RMW] ; po? ; [M]) |
        ([M] ; po? ; [RMW] ; fencerel(After_atomic) ; [M]) |
        ([M] ; po? ; [LKW] ; fencerel(After_spinlock) ; [M])
let gp = po ; [Sync-rcu] ; po?

let strong-fence = mb | gp

(* Release Acquire *)
let acq-po = [Acquire] ; po ; [M]
let po-rel = [M] ; po ; [Release]
let rfi-rel-acq = [Release] ; rfi ; [Acquire]

(**********************************)
(* Fundamental coherence ordering *)
(**********************************)

(* Sequential Consistency Per Variable *)
let com = rf | co | fr
acyclic po-loc | com as coherence

(* Atomic Read-Modify-Write *)
empty rmw & (fre ; coe) as atomic

(**********************************)
(* Instruction execution ordering *)
(**********************************)

(* Preserved Program Order *)
let dep = addr | data
let rwdep = (dep | ctrl) ; [W]
let overwrite = co | fr
let to-w = rwdep | (overwrite & int)
let rrdep = addr | (dep ; rfi)
let strong-rrdep = rrdep+ & rb-dep
let to-r = strong-rrdep | rfi-rel-acq
let fence = strong-fence | wmb | po-rel | rmb | acq-po
let ppo = rrdep* ; (to-r | to-w | fence)

(* Propagation: Ordering from release operations and strong fences. *)
let A-cumul(r) = rfe? ; r
let cumul-fence = A-cumul(strong-fence | po-rel) | wmb
let prop = (overwrite & ext)? ; cumul-fence* ; rfe?

(*
 * Happens Before: Ordering from the passage of time.
 * No fences needed here for prop because relation confined to one process.
 *)
let hb = ppo | rfe | ((prop \ id) & int)
acyclic hb as happens-before

(****************************************)
(* Write and fence propagation ordering *)
(****************************************)

(* Propagation: Each non-rf link needs a strong fence. *)
let pb = prop ; strong-fence ; hb*
acyclic pb as propagation

(*******)
(* RCU *)
(*******)

(*
 * Effect of read-side critical section proceeds from the rcu_read_lock()
 * onward on the one hand and from the rcu_read_unlock() backwards on the
 * other hand.
 *)
let rscs = po ; crit^-1 ; po?

(*
 * The synchronize_rcu() strong fence is special in that it can order not
 * one but two non-rf relations, but only in conjunction with an RCU
 * read-side critical section.
 *)
let link = hb* ; pb* ; prop

(* Chains that affect the RCU grace-period guarantee *)
let gp-link = gp ; link
let rscs-link = rscs ; link

(*
 * A cycle containing at least as many grace periods as RCU read-side
 * critical sections is forbidden.
 *)
let rec rcu-path =
        gp-link |
        (gp-link ; rscs-link) |
        (rscs-link ; gp-link) |
        (rcu-path ; rcu-path) |
        (gp-link ; rcu-path ; rscs-link) |
        (rscs-link ; rcu-path ; gp-link)

irreflexive rcu-path as rcu

tools/memory-model/linux-kernel.cfg (new file, 21 lines)

macros linux-kernel.def
bell linux-kernel.bell
model linux-kernel.cat
graph columns
squished true
showevents noregs
movelabel true
fontsize 8
xscale 2.0
yscale 1.5
arrowsize 0.8
showinitrf false
showfinalrf false
showinitwrites false
splines spline
pad 0.1
edgeattr hb,color,indigo
edgeattr co,color,blue
edgeattr mb,color,darkgreen
edgeattr wmb,color,darkgreen
edgeattr rmb,color,darkgreen

tools/memory-model/linux-kernel.def (new file, 108 lines)

// SPDX-License-Identifier: GPL-2.0+
//
// An earlier version of this file appears in the companion webpage for
// "Frightening small children and disconcerting grown-ups: Concurrency
// in the Linux kernel" by Alglave, Maranget, McKenney, Parri, and Stern,
// which is to appear in ASPLOS 2018.

// ONCE
READ_ONCE(X) __load{once}(X)
WRITE_ONCE(X,V) { __store{once}(X,V); }

// Release Acquire and friends
smp_store_release(X,V) { __store{release}(*X,V); }
smp_load_acquire(X) __load{acquire}(*X)
rcu_assign_pointer(X,V) { __store{release}(X,V); }
lockless_dereference(X) __load{lderef}(X)
rcu_dereference(X) __load{deref}(X)

// Fences
smp_mb() { __fence{mb} ; }
smp_rmb() { __fence{rmb} ; }
smp_wmb() { __fence{wmb} ; }
smp_read_barrier_depends() { __fence{rb_dep}; }
smp_mb__before_atomic() { __fence{before_atomic} ; }
smp_mb__after_atomic() { __fence{after_atomic} ; }
smp_mb__after_spinlock() { __fence{after_spinlock} ; }

// Exchange
xchg(X,V) __xchg{mb}(X,V)
xchg_relaxed(X,V) __xchg{once}(X,V)
xchg_release(X,V) __xchg{release}(X,V)
xchg_acquire(X,V) __xchg{acquire}(X,V)
cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W)
cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W)
cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W)
cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W)

// Spinlocks
spin_lock(X) { __lock(X) ; }
spin_unlock(X) { __unlock(X) ; }
spin_trylock(X) __trylock(X)

// RCU
rcu_read_lock() { __fence{rcu-lock}; }
rcu_read_unlock() { __fence{rcu-unlock};}
synchronize_rcu() { __fence{sync-rcu}; }
synchronize_rcu_expedited() { __fence{sync-rcu}; }

// Atomic
atomic_read(X) READ_ONCE(*X)
atomic_set(X,V) { WRITE_ONCE(*X,V) ; }
atomic_read_acquire(X) smp_load_acquire(X)
atomic_set_release(X,V) { smp_store_release(X,V); }

atomic_add(V,X) { __atomic_op(X,+,V) ; }
atomic_sub(V,X) { __atomic_op(X,-,V) ; }
atomic_inc(X) { __atomic_op(X,+,1) ; }
atomic_dec(X) { __atomic_op(X,-,1) ; }

atomic_add_return(V,X) __atomic_op_return{mb}(X,+,V)
atomic_add_return_relaxed(V,X) __atomic_op_return{once}(X,+,V)
atomic_add_return_acquire(V,X) __atomic_op_return{acquire}(X,+,V)
atomic_add_return_release(V,X) __atomic_op_return{release}(X,+,V)
atomic_fetch_add(V,X) __atomic_fetch_op{mb}(X,+,V)
atomic_fetch_add_relaxed(V,X) __atomic_fetch_op{once}(X,+,V)
atomic_fetch_add_acquire(V,X) __atomic_fetch_op{acquire}(X,+,V)
atomic_fetch_add_release(V,X) __atomic_fetch_op{release}(X,+,V)

atomic_inc_return(X) __atomic_op_return{mb}(X,+,1)
atomic_inc_return_relaxed(X) __atomic_op_return{once}(X,+,1)
atomic_inc_return_acquire(X) __atomic_op_return{acquire}(X,+,1)
atomic_inc_return_release(X) __atomic_op_return{release}(X,+,1)
atomic_fetch_inc(X) __atomic_fetch_op{mb}(X,+,1)
atomic_fetch_inc_relaxed(X) __atomic_fetch_op{once}(X,+,1)
atomic_fetch_inc_acquire(X) __atomic_fetch_op{acquire}(X,+,1)
atomic_fetch_inc_release(X) __atomic_fetch_op{release}(X,+,1)

atomic_sub_return(V,X) __atomic_op_return{mb}(X,-,V)
atomic_sub_return_relaxed(V,X) __atomic_op_return{once}(X,-,V)
atomic_sub_return_acquire(V,X) __atomic_op_return{acquire}(X,-,V)
atomic_sub_return_release(V,X) __atomic_op_return{release}(X,-,V)
atomic_fetch_sub(V,X) __atomic_fetch_op{mb}(X,-,V)
atomic_fetch_sub_relaxed(V,X) __atomic_fetch_op{once}(X,-,V)
atomic_fetch_sub_acquire(V,X) __atomic_fetch_op{acquire}(X,-,V)
atomic_fetch_sub_release(V,X) __atomic_fetch_op{release}(X,-,V)

atomic_dec_return(X) __atomic_op_return{mb}(X,-,1)
atomic_dec_return_relaxed(X) __atomic_op_return{once}(X,-,1)
atomic_dec_return_acquire(X) __atomic_op_return{acquire}(X,-,1)
atomic_dec_return_release(X) __atomic_op_return{release}(X,-,1)
atomic_fetch_dec(X) __atomic_fetch_op{mb}(X,-,1)
atomic_fetch_dec_relaxed(X) __atomic_fetch_op{once}(X,-,1)
atomic_fetch_dec_acquire(X) __atomic_fetch_op{acquire}(X,-,1)
atomic_fetch_dec_release(X) __atomic_fetch_op{release}(X,-,1)

atomic_xchg(X,V) __xchg{mb}(X,V)
atomic_xchg_relaxed(X,V) __xchg{once}(X,V)
atomic_xchg_release(X,V) __xchg{release}(X,V)
atomic_xchg_acquire(X,V) __xchg{acquire}(X,V)
atomic_cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W)
atomic_cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W)
atomic_cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W)
atomic_cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W)

atomic_sub_and_test(V,X) __atomic_op_return{mb}(X,-,V) == 0
atomic_dec_and_test(X) __atomic_op_return{mb}(X,-,1) == 0
atomic_inc_and_test(X) __atomic_op_return{mb}(X,+,1) == 0
atomic_add_negative(V,X) __atomic_op_return{mb}(X,+,V) < 0

tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus (new file, 19 lines)

C CoRR+poonceonce+Once

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
}

P1(int *x)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*x);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)

tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus (new file, 18 lines)

C CoRW+poonceonce+Once

{}

P0(int *x)
{
        int r0;

        r0 = READ_ONCE(*x);
        WRITE_ONCE(*x, 1);
}

P1(int *x)
{
        WRITE_ONCE(*x, 2);
}

exists (x=2 /\ 0:r0=2)

tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus (new file, 18 lines)

C CoWR+poonceonce+Once

{}

P0(int *x)
{
        int r0;

        WRITE_ONCE(*x, 1);
        r0 = READ_ONCE(*x);
}

P1(int *x)
{
        WRITE_ONCE(*x, 2);
}

exists (x=1 /\ 0:r0=2)

tools/memory-model/litmus-tests/CoWW+poonceonce.litmus (new file, 11 lines)

C CoWW+poonceonce

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*x, 2);
}

exists (x=1)

tools/memory-model/litmus-tests/IRIW+mbonceonces+OnceOnce.litmus (new file, 35 lines)

C IRIW+mbonceonces+OnceOnce

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
}

P1(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*x);
        smp_mb();
        r1 = READ_ONCE(*y);
}

P2(int *y)
{
        WRITE_ONCE(*y, 1);
}

P3(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        smp_mb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0 /\ 3:r0=1 /\ 3:r1=0)

tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus (new file, 33 lines)

C IRIW+poonceonces+OnceOnce

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
}

P1(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*x);
        r1 = READ_ONCE(*y);
}

P2(int *y)
{
        WRITE_ONCE(*y, 1);
}

P3(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0 /\ 3:r0=1 /\ 3:r1=0)

tools/memory-model/litmus-tests/ISA2+poonceonces.litmus (new file, 28 lines)

C ISA2+poonceonces

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*y, 1);
}

P1(int *y, int *z)
{
        int r0;

        r0 = READ_ONCE(*y);
        WRITE_ONCE(*z, 1);
}

P2(int *x, int *z)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*z);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 2:r0=1 /\ 2:r1=0)

tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus (new file, 28 lines)

C ISA2+pooncerelease+poacquirerelease+poacquireonce

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_store_release(y, 1);
}

P1(int *y, int *z)
{
        int r0;

        r0 = smp_load_acquire(y);
        smp_store_release(z, 1);
}

P2(int *x, int *z)
{
        int r0;
        int r1;

        r0 = smp_load_acquire(z);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 2:r0=1 /\ 2:r1=0)

tools/memory-model/litmus-tests/LB+ctrlonceonce+mbonceonce.litmus (new file, 23 lines)

C LB+ctrlonceonce+mbonceonce

{}

P0(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*x);
        if (r0)
                WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*y);
        smp_mb();
        WRITE_ONCE(*x, 1);
}

exists (0:r0=1 /\ 1:r0=1)

tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus (new file, 21 lines)

C LB+poacquireonce+pooncerelease

{}

P0(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*x);
        smp_store_release(y, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = smp_load_acquire(y);
        WRITE_ONCE(*x, 1);
}

exists (0:r0=1 /\ 1:r0=1)

tools/memory-model/litmus-tests/LB+poonceonces.litmus (new file, 21 lines)

C LB+poonceonces

{}

P0(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*x);
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*y);
        WRITE_ONCE(*x, 1);
}

exists (0:r0=1 /\ 1:r0=1)
@@ -0,0 +1,25 @@
C MP+onceassign+derefonce

{
y=z;
z=0;
}

P0(int *x, int **y)
{
        WRITE_ONCE(*x, 1);
        rcu_assign_pointer(*y, x);
}

P1(int *x, int **y)
{
        int *r0;
        int r1;

        rcu_read_lock();
        r0 = rcu_dereference(*y);
        r1 = READ_ONCE(*r0);
        rcu_read_unlock();
}

exists (1:r0=x /\ 1:r1=0)
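For reference, a kernel-style sketch of the publish/subscribe pattern this test models (illustrative only; struct foo, gp, publisher(), and reader() are assumed names, not code from this commit):

/* Illustrative sketch only; the structure and globals are hypothetical. */
struct foo {
        int a;
};
static struct foo foo_instance;
static struct foo __rcu *gp;

void publisher(void)
{
        foo_instance.a = 1;                     /* initialize before publishing */
        rcu_assign_pointer(gp, &foo_instance);  /* orders init before the pointer store */
}

void reader(void)
{
        struct foo *p;

        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
                BUG_ON(READ_ONCE(p->a) == 0);   /* forbidden by the litmus test */
        rcu_read_unlock();
}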
24  tools/memory-model/litmus-tests/MP+polocks.litmus  Normal file
@@ -0,0 +1,24 @@
C MP+polocks

{}

P0(int *x, int *y, spinlock_t *mylock)
{
        WRITE_ONCE(*x, 1);
        spin_lock(mylock);
        WRITE_ONCE(*y, 1);
        spin_unlock(mylock);
}

P1(int *x, int *y, spinlock_t *mylock)
{
        int r0;
        int r1;

        spin_lock(mylock);
        r0 = READ_ONCE(*y);
        spin_unlock(mylock);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)
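For reference, a kernel-style sketch of the lock-based hand-off that MP+polocks models (illustrative only; x, y, mylock, writer(), and reader() are assumed names, not code from this commit):

/* Illustrative sketch only; globals and functions are hypothetical. */
static int x, y;
static DEFINE_SPINLOCK(mylock);

void writer(void)
{
        WRITE_ONCE(x, 1);               /* before the critical section */
        spin_lock(&mylock);
        WRITE_ONCE(y, 1);               /* inside the critical section */
        spin_unlock(&mylock);
}

void reader(void)
{
        int r0, r1;

        spin_lock(&mylock);
        r0 = READ_ONCE(y);
        spin_unlock(&mylock);
        r1 = READ_ONCE(x);              /* after the critical section */
        /* If r0 == 1, the model says r1 must also be 1. */
}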
20  tools/memory-model/litmus-tests/MP+poonceonces.litmus  Normal file
@@ -0,0 +1,20 @@
C MP+poonceonces

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)
@@ -0,0 +1,20 @@
C MP+pooncerelease+poacquireonce

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_store_release(y, 1);
}

P1(int *x, int *y)
{
        int r0;
        int r1;

        r0 = smp_load_acquire(y);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)
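For reference, this is the classic message-passing recipe; a kernel-style sketch (illustrative only; msg, msg_ready, producer(), and consumer() are assumed names, not code from this commit) might look like:

/* Illustrative sketch only; globals and functions are hypothetical. */
static int msg;
static int msg_ready;

void producer(void)
{
        WRITE_ONCE(msg, 42);
        smp_store_release(&msg_ready, 1);   /* flag published after the data */
}

void consumer(void)
{
        if (smp_load_acquire(&msg_ready))   /* pairs with the release above */
                BUG_ON(READ_ONCE(msg) != 42);
}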
24  tools/memory-model/litmus-tests/MP+porevlocks.litmus  Normal file
@@ -0,0 +1,24 @@
C MP+porevlocks

{}

P0(int *x, int *y, spinlock_t *mylock)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        spin_lock(mylock);
        r1 = READ_ONCE(*x);
        spin_unlock(mylock);
}

P1(int *x, int *y, spinlock_t *mylock)
{
        spin_lock(mylock);
        WRITE_ONCE(*x, 1);
        spin_unlock(mylock);
        WRITE_ONCE(*y, 1);
}

exists (0:r0=1 /\ 0:r1=0)
@@ -0,0 +1,22 @@
C MP+wmbonceonce+rmbonceonce

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_wmb();
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        smp_rmb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)
21  tools/memory-model/litmus-tests/R+mbonceonces.litmus  Normal file
@@ -0,0 +1,21 @@
C R+mbonceonces

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_mb();
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*y, 2);
        smp_mb();
        r0 = READ_ONCE(*x);
}

exists (y=2 /\ 1:r0=0)
19  tools/memory-model/litmus-tests/R+poonceonces.litmus  Normal file
@@ -0,0 +1,19 @@
C R+poonceonces

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*y, 2);
        r0 = READ_ONCE(*x);
}

exists (y=2 /\ 1:r0=0)
125  tools/memory-model/litmus-tests/README  Normal file
@@ -0,0 +1,125 @@
This directory contains the following litmus tests:

CoRR+poonceonce+Once.litmus
        Test of read-read coherence, that is, whether or not two
        successive reads from the same variable are ordered.

CoRW+poonceonce+Once.litmus
        Test of read-write coherence, that is, whether or not a read
        from a given variable followed by a write to that same variable
        are ordered.

CoWR+poonceonce+Once.litmus
        Test of write-read coherence, that is, whether or not a write
        to a given variable followed by a read from that same variable
        are ordered.

CoWW+poonceonce.litmus
        Test of write-write coherence, that is, whether or not two
        successive writes to the same variable are ordered.

IRIW+mbonceonces+OnceOnce.litmus
        Test of independent reads from independent writes with smp_mb()
        between each pair of reads.  In other words, is smp_mb()
        sufficient to cause two different reading processes to agree on
        the order of a pair of writes, where each write is to a different
        variable by a different process?

IRIW+poonceonces+OnceOnce.litmus
        Test of independent reads from independent writes with nothing
        between each pair of reads.  In other words, is anything at all
        needed to cause two different reading processes to agree on the
        order of a pair of writes, where each write is to a different
        variable by a different process?

ISA2+poonceonces.litmus
        As below, but with store-release replaced with WRITE_ONCE()
        and load-acquire replaced with READ_ONCE().

ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus
        Can a release-acquire chain order a prior store against
        a later load?

LB+ctrlonceonce+mbonceonce.litmus
        Does a control dependency and an smp_mb() suffice for the
        load-buffering litmus test, where each process reads from one
        of two variables then writes to the other?

LB+poacquireonce+pooncerelease.litmus
        Does a release-acquire pair suffice for the load-buffering
        litmus test, where each process reads from one of two variables
        then writes to the other?

LB+poonceonces.litmus
        As above, but with store-release replaced with WRITE_ONCE()
        and load-acquire replaced with READ_ONCE().

MP+onceassign+derefonce.litmus
        As below, but with rcu_assign_pointer() and an rcu_dereference().

MP+polocks.litmus
        As below, but with the second access of the writer process
        and the first access of the reader process protected by a lock.

MP+poonceonces.litmus
        As below, but without the smp_rmb() and smp_wmb().

MP+pooncerelease+poacquireonce.litmus
        As below, but with a release-acquire chain.

MP+porevlocks.litmus
        As below, but with the first access of the writer process
        and the second access of the reader process protected by a lock.

MP+wmbonceonce+rmbonceonce.litmus
        Does an smp_wmb() (between the stores) and an smp_rmb() (between
        the loads) suffice for the message-passing litmus test, where one
        process writes data and then a flag, and the other process reads
        the flag and then the data?  (This is similar to the ISA2 tests,
        but with two processes instead of three.)

R+mbonceonces.litmus
        This is the fully ordered (via smp_mb()) version of one of
        the classic counterintuitive litmus tests that illustrates the
        effects of store propagation delays.

R+poonceonces.litmus
        As above, but without the smp_mb() invocations.

SB+mbonceonces.litmus
        This is the fully ordered (again, via smp_mb()) version of store
        buffering, which forms the core of Dekker's mutual-exclusion
        algorithm.

SB+poonceonces.litmus
        As above, but without the smp_mb() invocations.

S+poonceonces.litmus
        As below, but without the smp_wmb() and acquire load.

S+wmbonceonce+poacquireonce.litmus
        Can an smp_wmb(), instead of a release, and an acquire order
        a prior store against a subsequent store?

WRC+poonceonces+Once.litmus
WRC+pooncerelease+rmbonceonce+Once.litmus
        These two are members of an extension of the MP litmus-test class
        in which the first write is moved to a separate process.

Z6.0+pooncelock+pooncelock+pombonce.litmus
        Is the ordering provided by a spin_unlock() and a subsequent
        spin_lock() sufficient to make ordering apparent to accesses
        by a process not holding the lock?

Z6.0+pooncelock+poonceLock+pombonce.litmus
        As above, but with smp_mb__after_spinlock() immediately
        following the spin_lock().

Z6.0+pooncerelease+poacquirerelease+mbonceonce.litmus
        Is the ordering provided by a release-acquire chain sufficient
        to make ordering apparent to accesses by a process that does
        not participate in that release-acquire chain?

A great many more litmus tests are available here:

        https://github.com/paulmckrcu/litmus
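Assuming the externally provided herd7 simulator is installed, these tests are intended to be run from the tools/memory-model directory; the invocation below is an example rather than text from this README:

        herd7 -conf linux-kernel.cfg litmus-tests/SB+mbonceonces.litmus

herd7 then reports whether the outcome named in the test's "exists" clause can occur under the model.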
19  tools/memory-model/litmus-tests/S+poonceonces.litmus  Normal file
@@ -0,0 +1,19 @@
C S+poonceonces

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 2);
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*y);
        WRITE_ONCE(*x, 1);
}

exists (x=2 /\ 1:r0=1)
@@ -0,0 +1,20 @@
C S+wmbonceonce+poacquireonce

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 2);
        smp_wmb();
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = smp_load_acquire(y);
        WRITE_ONCE(*x, 1);
}

exists (x=2 /\ 1:r0=1)
23  tools/memory-model/litmus-tests/SB+mbonceonces.litmus  Normal file
@@ -0,0 +1,23 @@
C SB+mbonceonces

{}

P0(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*x, 1);
        smp_mb();
        r0 = READ_ONCE(*y);
}

P1(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*y, 1);
        smp_mb();
        r0 = READ_ONCE(*x);
}

exists (0:r0=0 /\ 1:r0=0)
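For reference, a kernel-style sketch of the store-buffering (Dekker-style) pattern that SB+mbonceonces models (illustrative only; flag0, flag1, and the cpu*() functions are assumed names, not code from this commit):

/* Illustrative sketch only; each CPU claims its own flag, then checks the other. */
static int flag0, flag1;

void cpu0(void)
{
        int r0;

        WRITE_ONCE(flag0, 1);
        smp_mb();                       /* order the store before the load */
        r0 = READ_ONCE(flag1);
        /* r0 == 0 means cpu0 may enter the critical section. */
}

void cpu1(void)
{
        int r0;

        WRITE_ONCE(flag1, 1);
        smp_mb();
        r0 = READ_ONCE(flag0);
        /* r0 == 0 means cpu1 may enter the critical section. */
}
/* With the smp_mb() calls, at most one CPU can see the other's flag as 0. */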
21  tools/memory-model/litmus-tests/SB+poonceonces.litmus  Normal file
@@ -0,0 +1,21 @@
C SB+poonceonces

{}

P0(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*x, 1);
        r0 = READ_ONCE(*y);
}

P1(int *x, int *y)
{
        int r0;

        WRITE_ONCE(*y, 1);
        r0 = READ_ONCE(*x);
}

exists (0:r0=0 /\ 1:r0=0)
27  tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus  Normal file
@@ -0,0 +1,27 @@
C WRC+poonceonces+Once

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*x);
        WRITE_ONCE(*y, 1);
}

P2(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 2:r0=1 /\ 2:r1=0)
@@ -0,0 +1,28 @@
C WRC+pooncerelease+rmbonceonce+Once

{}

P0(int *x)
{
        WRITE_ONCE(*x, 1);
}

P1(int *x, int *y)
{
        int r0;

        r0 = READ_ONCE(*x);
        smp_store_release(y, 1);
}

P2(int *x, int *y)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        smp_rmb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 2:r0=1 /\ 2:r1=0)
@@ -0,0 +1,33 @@
C Z6.0+pooncelock+poonceLock+pombonce

{}

P0(int *x, int *y, spinlock_t *mylock)
{
        spin_lock(mylock);
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*y, 1);
        spin_unlock(mylock);
}

P1(int *y, int *z, spinlock_t *mylock)
{
        int r0;

        spin_lock(mylock);
        smp_mb__after_spinlock();
        r0 = READ_ONCE(*y);
        WRITE_ONCE(*z, 1);
        spin_unlock(mylock);
}

P2(int *x, int *z)
{
        int r1;

        WRITE_ONCE(*z, 2);
        smp_mb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ z=2 /\ 2:r1=0)
@@ -0,0 +1,32 @@
C Z6.0+pooncelock+pooncelock+pombonce

{}

P0(int *x, int *y, spinlock_t *mylock)
{
        spin_lock(mylock);
        WRITE_ONCE(*x, 1);
        WRITE_ONCE(*y, 1);
        spin_unlock(mylock);
}

P1(int *y, int *z, spinlock_t *mylock)
{
        int r0;

        spin_lock(mylock);
        r0 = READ_ONCE(*y);
        WRITE_ONCE(*z, 1);
        spin_unlock(mylock);
}

P2(int *x, int *z)
{
        int r1;

        WRITE_ONCE(*z, 2);
        smp_mb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ z=2 /\ 2:r1=0)
@@ -0,0 +1,28 @@
C Z6.0+pooncerelease+poacquirerelease+mbonceonce

{}

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_store_release(y, 1);
}

P1(int *y, int *z)
{
        int r0;

        r0 = smp_load_acquire(y);
        smp_store_release(z, 1);
}

P2(int *x, int *z)
{
        int r1;

        WRITE_ONCE(*z, 2);
        smp_mb();
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ z=2 /\ 2:r1=0)
99  tools/memory-model/lock.cat  Normal file
@@ -0,0 +1,99 @@
// SPDX-License-Identifier: GPL-2.0+
(*
 * Copyright (C) 2016 Luc Maranget <luc.maranget@inria.fr> for Inria
 * Copyright (C) 2017 Alan Stern <stern@rowland.harvard.edu>
 *)

(* Generate coherence orders and handle lock operations *)

include "cross.cat"

(* From lock reads to their partner lock writes *)
let lk-rmw = ([LKR] ; po-loc ; [LKW]) \ (po ; po)
let rmw = rmw | lk-rmw

(*
 * A paired LKR must always see an unlocked value; spin_lock() calls nested
 * inside a critical section (for the same lock) always deadlock.
 *)
empty ([LKW] ; po-loc ; [domain(lk-rmw)]) \ (po-loc ; [UL] ; po-loc)
        as lock-nest

(* The litmus test is invalid if an LKW event is not part of an RMW pair *)
flag ~empty LKW \ range(lk-rmw) as unpaired-LKW

(* This will be allowed if we implement spin_is_locked() *)
flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR

(* There should be no R or W accesses to spinlocks *)
let ALL-LOCKS = LKR | LKW | UL | LF
flag ~empty [M \ IW] ; loc ; [ALL-LOCKS] as mixed-lock-accesses

(* The final value of a spinlock should not be tested *)
flag ~empty [FW] ; loc ; [ALL-LOCKS] as lock-final


(*
 * Put lock operations in their appropriate classes, but leave UL out of W
 * until after the co relation has been generated.
 *)
let R = R | LKR | LF
let W = W | LKW

let Release = Release | UL
let Acquire = Acquire | LKR


(* Match LKW events to their corresponding UL events *)
let critical = ([LKW] ; po-loc ; [UL]) \ (po-loc ; [LKW | UL] ; po-loc)

flag ~empty UL \ range(critical) as unmatched-unlock

(* Allow up to one unmatched LKW per location; more must deadlock *)
let UNMATCHED-LKW = LKW \ domain(critical)
empty ([UNMATCHED-LKW] ; loc ; [UNMATCHED-LKW]) \ id as unmatched-locks


(* rfi for LF events: link each LKW to the LF events in its critical section *)
let rfi-lf = ([LKW] ; po-loc ; [LF]) \ ([LKW] ; po-loc ; [UL] ; po-loc)

(* rfe for LF events *)
let all-possible-rfe-lf =
        (*
         * Given an LF event r, compute the possible rfe edges for that event
         * (all those starting from LKW events in other threads),
         * and then convert that relation to a set of single-edge relations.
         *)
        let possible-rfe-lf r =
                let pair-to-relation p = p ++ 0
                in map pair-to-relation ((LKW * {r}) & loc & ext)
        (* Do this for each LF event r that isn't in rfi-lf *)
        in map possible-rfe-lf (LF \ range(rfi-lf))

(* Generate all rf relations for LF events *)
with rfe-lf from cross(all-possible-rfe-lf)
let rf = rf | rfi-lf | rfe-lf


(* Generate all co relations, including LKW events but not UL *)
let co0 = co0 | ([IW] ; loc ; [LKW]) |
        (([LKW] ; loc ; [UNMATCHED-LKW]) \ [UNMATCHED-LKW])
include "cos-opt.cat"
let W = W | UL
let M = R | W

(* Merge UL events into co *)
let co = (co | critical | (critical^-1 ; co))+
let coe = co & ext
let coi = co & int

(* Merge LKR events into rf *)
let rf = rf | ([IW | UL] ; singlestep(co) ; lk-rmw^-1)
let rfe = rf & ext
let rfi = rf & int

let fr = rf^-1 ; co
let fre = fr & ext
let fri = fr & int

show co,rf,fr