x86/entry: Add STACKLEAK erasing the kernel stack at the end of syscalls
The STACKLEAK feature (initially developed by the PaX Team) has the following benefits: 1. Reduces the information that can be revealed through kernel stack leak bugs. The idea of erasing the thread stack at the end of syscalls is similar to CONFIG_PAGE_POISONING and memzero_explicit() in kernel crypto, which all comply with FDP_RIP.2 (Full Residual Information Protection) of the Common Criteria standard. 2. Blocks some uninitialized stack variable attacks (e.g. CVE-2017-17712, CVE-2010-2963). Bugs of that kind should be eliminated by improving C compilers in the future, which might take a long time. This commit introduces the code that fills the used part of the kernel stack with a poison value before returning to userspace. The full STACKLEAK feature also contains the gcc plugin, which comes in a separate commit. The STACKLEAK feature is ported from grsecurity/PaX. More information at: https://grsecurity.net/ https://pax.grsecurity.net/ This code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on our understanding of the code. Changes or omissions from the original code are ours and don't reflect the original grsecurity/PaX code. Performance impact: Hardware: Intel Core i7-4770, 16 GB RAM Test #1: building the Linux kernel on a single core: 0.91% slowdown Test #2: hackbench -s 4096 -l 2000 -g 15 -f 25 -P: 4.2% slowdown So the STACKLEAK description in Kconfig includes: "The tradeoff is the performance impact: on a single CPU system kernel compilation sees a 1% slowdown, other systems and workloads may vary and you are advised to test this feature on your expected workload before deploying it". Signed-off-by: Alexander Popov <alex.popov@linux.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Kees Cook <keescook@chromium.org>
This commit is contained in:
		
							parent
							
								
									57361846b5
								
							
						
					
					
						commit
						afaef01c00
					
				@ -24,6 +24,7 @@ ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
 | 
			
		||||
[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
 | 
			
		||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 | 
			
		||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 | 
			
		||||
STACKLEAK_POISON value in this last hole: ffffffffffff4111
 | 
			
		||||
 | 
			
		||||
Virtual memory map with 5 level page tables:
 | 
			
		||||
 | 
			
		||||
@ -50,6 +51,7 @@ ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
 | 
			
		||||
[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
 | 
			
		||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 | 
			
		||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 | 
			
		||||
STACKLEAK_POISON value in this last hole: ffffffffffff4111
 | 
			
		||||
 | 
			
		||||
Architecture defines a 64-bit virtual address. Implementations can support
 | 
			
		||||
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
 | 
			
		||||
 | 
			
		||||
@ -419,6 +419,13 @@ config SECCOMP_FILTER
 | 
			
		||||
 | 
			
		||||
	  See Documentation/userspace-api/seccomp_filter.rst for details.
 | 
			
		||||
 | 
			
		||||
config HAVE_ARCH_STACKLEAK
 | 
			
		||||
	bool
 | 
			
		||||
	help
 | 
			
		||||
	  An architecture should select this if it has the code which
 | 
			
		||||
	  fills the used part of the kernel stack with the STACKLEAK_POISON
 | 
			
		||||
	  value before returning from system calls.
 | 
			
		||||
 | 
			
		||||
config HAVE_STACKPROTECTOR
 | 
			
		||||
	bool
 | 
			
		||||
	help
 | 
			
		||||
 | 
			
		||||
@ -127,6 +127,7 @@ config X86
 | 
			
		||||
	select HAVE_ARCH_PREL32_RELOCATIONS
 | 
			
		||||
	select HAVE_ARCH_SECCOMP_FILTER
 | 
			
		||||
	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 | 
			
		||||
	select HAVE_ARCH_STACKLEAK
 | 
			
		||||
	select HAVE_ARCH_TRACEHOOK
 | 
			
		||||
	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 | 
			
		||||
	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 | 
			
		||||
 | 
			
		||||
@ -329,8 +329,22 @@ For 32-bit we have the following conventions - kernel is built with
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/*
 * STACKLEAK_ERASE_NOCLOBBER - call stackleak_erase() from an exit path
 * whose registers are still live.
 *
 * The call is bracketed by PUSH_AND_CLEAR_REGS / POP_REGS (defined outside
 * this macro; presumably they save and restore the register set, which is
 * what the "NOCLOBBER" name relies on).
 *
 * Expands to nothing unless CONFIG_GCC_PLUGIN_STACKLEAK is defined.
 */
.macro STACKLEAK_ERASE_NOCLOBBER
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	PUSH_AND_CLEAR_REGS
	call stackleak_erase
	POP_REGS
#endif
.endm
 | 
			
		||||
 | 
			
		||||
#endif /* CONFIG_X86_64 */
 | 
			
		||||
 | 
			
		||||
/*
 * STACKLEAK_ERASE - plain call to stackleak_erase().
 *
 * Unlike STACKLEAK_ERASE_NOCLOBBER, this macro does not save or restore
 * any registers itself, so whatever the C function clobbers is lost to
 * the caller.
 *
 * Expands to nothing unless CONFIG_GCC_PLUGIN_STACKLEAK is defined.
 */
.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	call stackleak_erase
#endif
.endm
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * This does 'call enter_from_user_mode' unless we can avoid it based on
 | 
			
		||||
 * kernel config or using the static jump infrastructure.
 | 
			
		||||
 | 
			
		||||
@ -46,6 +46,8 @@
 | 
			
		||||
#include <asm/frame.h>
 | 
			
		||||
#include <asm/nospec-branch.h>
 | 
			
		||||
 | 
			
		||||
#include "calling.h"
 | 
			
		||||
 | 
			
		||||
	.section .entry.text, "ax"
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
@ -711,6 +713,7 @@ ENTRY(ret_from_fork)
 | 
			
		||||
	/* When we fork, we trace the syscall return in the child, too. */
 | 
			
		||||
	movl    %esp, %eax
 | 
			
		||||
	call    syscall_return_slowpath
 | 
			
		||||
	STACKLEAK_ERASE
 | 
			
		||||
	jmp     restore_all
 | 
			
		||||
 | 
			
		||||
	/* kernel thread */
 | 
			
		||||
@ -885,6 +888,8 @@ ENTRY(entry_SYSENTER_32)
 | 
			
		||||
	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
 | 
			
		||||
		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
 | 
			
		||||
 | 
			
		||||
	STACKLEAK_ERASE
 | 
			
		||||
 | 
			
		||||
/* Opportunistic SYSEXIT */
 | 
			
		||||
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
 | 
			
		||||
 | 
			
		||||
@ -996,6 +1001,8 @@ ENTRY(entry_INT80_32)
 | 
			
		||||
	call	do_int80_syscall_32
 | 
			
		||||
.Lsyscall_32_done:
 | 
			
		||||
 | 
			
		||||
	STACKLEAK_ERASE
 | 
			
		||||
 | 
			
		||||
restore_all:
 | 
			
		||||
	TRACE_IRQS_IRET
 | 
			
		||||
	SWITCH_TO_ENTRY_STACK
 | 
			
		||||
 | 
			
		||||
@ -329,6 +329,8 @@ syscall_return_via_sysret:
 | 
			
		||||
	 * We are on the trampoline stack.  All regs except RDI are live.
 | 
			
		||||
	 * We can do future final exit work right here.
 | 
			
		||||
	 */
 | 
			
		||||
	STACKLEAK_ERASE_NOCLOBBER
 | 
			
		||||
 | 
			
		||||
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 | 
			
		||||
 | 
			
		||||
	popq	%rdi
 | 
			
		||||
@ -688,6 +690,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 | 
			
		||||
	 * We are on the trampoline stack.  All regs except RDI are live.
 | 
			
		||||
	 * We can do future final exit work right here.
 | 
			
		||||
	 */
 | 
			
		||||
	STACKLEAK_ERASE_NOCLOBBER
 | 
			
		||||
 | 
			
		||||
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -261,6 +261,11 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 | 
			
		||||
 | 
			
		||||
	/* Opportunistic SYSRET */
 | 
			
		||||
sysret32_from_system_call:
 | 
			
		||||
	/*
 | 
			
		||||
	 * We are not going to return to userspace from the trampoline
 | 
			
		||||
	 * stack. So let's erase the thread stack right now.
 | 
			
		||||
	 */
 | 
			
		||||
	STACKLEAK_ERASE
 | 
			
		||||
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
 | 
			
		||||
	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
 | 
			
		||||
	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
 | 
			
		||||
 | 
			
		||||
@ -1192,6 +1192,10 @@ struct task_struct {
 | 
			
		||||
	void				*security;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 | 
			
		||||
	unsigned long			lowest_stack;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * New fields for task_struct should be added above here, so that
 | 
			
		||||
	 * they are included in the randomized portion of task_struct.
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										26
									
								
								include/linux/stackleak.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								include/linux/stackleak.h
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,26 @@
 | 
			
		||||
/* SPDX-License-Identifier: GPL-2.0 */
 | 
			
		||||
#ifndef _LINUX_STACKLEAK_H
 | 
			
		||||
#define _LINUX_STACKLEAK_H
 | 
			
		||||
 | 
			
		||||
#include <linux/sched.h>
 | 
			
		||||
#include <linux/sched/task_stack.h>
 | 
			
		||||
 | 
			
		||||
/*
 * STACKLEAK_POISON is the word written over the used part of the kernel
 * stack. Converted to a 64-bit 'unsigned long', -0xBEEF becomes
 * 0xffffffffffff4111. Check that this value points into an unused hole
 * of the virtual memory map for your platform.
 *
 * STACKLEAK_SEARCH_DEPTH is a size in bytes: stackleak_erase() divides it
 * by sizeof(unsigned long) and ends its downward search for poison once
 * it has seen more than that many consecutive poisoned words.
 */
#define STACKLEAK_POISON -0xBEEF
#define STACKLEAK_SEARCH_DEPTH 128
 | 
			
		||||
 | 
			
		||||
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
#include <asm/stacktrace.h>

/*
 * stackleak_task_init() - set the initial 'lowest_stack' of task @t.
 *
 * The starting value is one 'unsigned long' above end_of_stack(t), i.e.
 * the word at end_of_stack(t) itself is left out.
 */
static inline void stackleak_task_init(struct task_struct *t)
{
	unsigned long stack_end = (unsigned long)end_of_stack(t);

	t->lowest_stack = stack_end + sizeof(unsigned long);
}
#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
/* Without CONFIG_GCC_PLUGIN_STACKLEAK there is nothing to initialize. */
static inline void stackleak_task_init(struct task_struct *t) { }
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@ -117,6 +117,10 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o
 | 
			
		||||
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
 | 
			
		||||
obj-$(CONFIG_RSEQ) += rseq.o
 | 
			
		||||
 | 
			
		||||
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
 | 
			
		||||
KASAN_SANITIZE_stackleak.o := n
 | 
			
		||||
KCOV_INSTRUMENT_stackleak.o := n
 | 
			
		||||
 | 
			
		||||
$(obj)/configs.o: $(obj)/config_data.h
 | 
			
		||||
 | 
			
		||||
targets += config_data.gz
 | 
			
		||||
 | 
			
		||||
@ -91,6 +91,7 @@
 | 
			
		||||
#include <linux/kcov.h>
 | 
			
		||||
#include <linux/livepatch.h>
 | 
			
		||||
#include <linux/thread_info.h>
 | 
			
		||||
#include <linux/stackleak.h>
 | 
			
		||||
 | 
			
		||||
#include <asm/pgtable.h>
 | 
			
		||||
#include <asm/pgalloc.h>
 | 
			
		||||
@ -1880,6 +1881,8 @@ static __latent_entropy struct task_struct *copy_process(
 | 
			
		||||
	if (retval)
 | 
			
		||||
		goto bad_fork_cleanup_io;
 | 
			
		||||
 | 
			
		||||
	stackleak_task_init(p);
 | 
			
		||||
 | 
			
		||||
	if (pid != &init_struct_pid) {
 | 
			
		||||
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
 | 
			
		||||
		if (IS_ERR(pid)) {
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										62
									
								
								kernel/stackleak.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								kernel/stackleak.c
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,62 @@
 | 
			
		||||
// SPDX-License-Identifier: GPL-2.0
 | 
			
		||||
/*
 | 
			
		||||
 * This code fills the used part of the kernel stack with a poison value
 | 
			
		||||
 * before returning to userspace. It's part of the STACKLEAK feature
 | 
			
		||||
 * ported from grsecurity/PaX.
 | 
			
		||||
 *
 | 
			
		||||
 * Author: Alexander Popov <alex.popov@linux.com>
 | 
			
		||||
 *
 | 
			
		||||
 * STACKLEAK reduces the information which kernel stack leak bugs can
 | 
			
		||||
 * reveal and blocks some uninitialized stack variable attacks.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include <linux/stackleak.h>
 | 
			
		||||
 | 
			
		||||
asmlinkage void stackleak_erase(void)
 | 
			
		||||
{
 | 
			
		||||
	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
 | 
			
		||||
	unsigned long kstack_ptr = current->lowest_stack;
 | 
			
		||||
	unsigned long boundary = (unsigned long)end_of_stack(current);
 | 
			
		||||
	unsigned int poison_count = 0;
 | 
			
		||||
	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 | 
			
		||||
 | 
			
		||||
	/* Check that 'lowest_stack' value is sane */
 | 
			
		||||
	if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
 | 
			
		||||
		kstack_ptr = boundary;
 | 
			
		||||
 | 
			
		||||
	/* Search for the poison value in the kernel stack */
 | 
			
		||||
	while (kstack_ptr > boundary && poison_count <= depth) {
 | 
			
		||||
		if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
 | 
			
		||||
			poison_count++;
 | 
			
		||||
		else
 | 
			
		||||
			poison_count = 0;
 | 
			
		||||
 | 
			
		||||
		kstack_ptr -= sizeof(unsigned long);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * One 'long int' at the bottom of the thread stack is reserved and
 | 
			
		||||
	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
 | 
			
		||||
	 */
 | 
			
		||||
	if (kstack_ptr == boundary)
 | 
			
		||||
		kstack_ptr += sizeof(unsigned long);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Now write the poison value to the kernel stack. Start from
 | 
			
		||||
	 * 'kstack_ptr' and move up till the new 'boundary'. We assume that
 | 
			
		||||
	 * the stack pointer doesn't change when we write poison.
 | 
			
		||||
	 */
 | 
			
		||||
	if (on_thread_stack())
 | 
			
		||||
		boundary = current_stack_pointer;
 | 
			
		||||
	else
 | 
			
		||||
		boundary = current_top_of_stack();
 | 
			
		||||
 | 
			
		||||
	while (kstack_ptr < boundary) {
 | 
			
		||||
		*(unsigned long *)kstack_ptr = STACKLEAK_POISON;
 | 
			
		||||
		kstack_ptr += sizeof(unsigned long);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Reset the 'lowest_stack' value for the next syscall */
 | 
			
		||||
	current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -139,4 +139,23 @@ config GCC_PLUGIN_RANDSTRUCT_PERFORMANCE
 | 
			
		||||
	  in structures.  This reduces the performance hit of RANDSTRUCT
 | 
			
		||||
	  at the cost of weakened randomization.
 | 
			
		||||
 | 
			
		||||
config GCC_PLUGIN_STACKLEAK
 | 
			
		||||
	bool "Erase the kernel stack before returning from syscalls"
 | 
			
		||||
	depends on GCC_PLUGINS
 | 
			
		||||
	depends on HAVE_ARCH_STACKLEAK
 | 
			
		||||
	help
 | 
			
		||||
	  This option makes the kernel erase the kernel stack before
 | 
			
		||||
	  returning from system calls. That reduces the information which
 | 
			
		||||
	  kernel stack leak bugs can reveal and blocks some uninitialized
 | 
			
		||||
	  stack variable attacks.
 | 
			
		||||
 | 
			
		||||
	  The tradeoff is the performance impact: on a single CPU system kernel
 | 
			
		||||
	  compilation sees a 1% slowdown, other systems and workloads may vary
 | 
			
		||||
	  and you are advised to test this feature on your expected workload
 | 
			
		||||
	  before deploying it.
 | 
			
		||||
 | 
			
		||||
	  This plugin was ported from grsecurity/PaX. More information at:
 | 
			
		||||
	   * https://grsecurity.net/
 | 
			
		||||
	   * https://pax.grsecurity.net/
 | 
			
		||||
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user