From 790ce3b40017bbd759a3d81e23c05d42b3d34b90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2020 16:01:52 +0200 Subject: [PATCH 01/13] x86/idtentry: Remove stale comment Stack switching for interrupt handlers happens in C now for both 64 and 32bit. Remove the stale comment which claims the contrary. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/idtentry.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2293b443b193..50ea186b8108 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -191,11 +191,9 @@ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code) * to the function as error_code argument which needs to be truncated * to an u8 because the push is sign extending. * - * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before - * and after switching to the interrupt stack. On 32-bit this happens in C. - * * irq_enter/exit_rcu() are invoked before the function body and the - * KVM L1D flush request is set. + * KVM L1D flush request is set. Stack switching to the interrupt stack + * has to be done in the function body if necessary. */ #define DEFINE_IDTENTRY_IRQ(func) \ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ From 7f6fa101dfac8739764e47751d314551f6160c98 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 23 Jul 2020 09:14:05 -0700 Subject: [PATCH 02/13] x86: Correct noinstr qualifiers The noinstr qualifier is to be specified before the return type in the same way inline is used. These 2 cases were missed by previous patches. Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200723161405.852613-1-ira.weiny@intel.com --- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/cpu/mce/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..069e77c0a360 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1044,7 +1044,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } -int noinstr poke_int3_handler(struct pt_regs *regs) +noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 14e4b4d17ee5..6d7aa5642688 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1212,7 +1212,7 @@ static void kill_me_maybe(struct callback_head *cb) * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void noinstr do_machine_check(struct pt_regs *regs) +noinstr void do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); From 8d5ea35c5e9139dbd19a3d73985d008d36c9968f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:00 +0200 Subject: [PATCH 03/13] x86/entry: Consolidate check_user_regs() The user register sanity check is sprinkled all over the place. Move it into enter_from_user_mode(). 
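The pattern, roughly (illustrative sketch, not part of the diff below): instead of every user mode entry point repeating the pair

	check_user_regs(regs);
	enter_from_user_mode();

each entry point now makes the single call

	enter_from_user_mode(regs);

which performs the register sanity check internally before the lockdep and context tracking work.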
Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220519.943016204@linutronix.de --- arch/x86/entry/common.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 4eae4c1f254d..ab6cb8602c0b 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -82,10 +82,11 @@ static noinstr void check_user_regs(struct pt_regs *regs) * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state */ -static noinstr void enter_from_user_mode(void) +static noinstr void enter_from_user_mode(struct pt_regs *regs) { enum ctx_state state = ct_state(); + check_user_regs(regs); lockdep_hardirqs_off(CALLER_ADDR0); user_exit_irqoff(); @@ -95,8 +96,9 @@ static noinstr void enter_from_user_mode(void) instrumentation_end(); } #else -static __always_inline void enter_from_user_mode(void) +static __always_inline void enter_from_user_mode(struct pt_regs *regs) { + check_user_regs(regs); lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); trace_hardirqs_off_finish(); @@ -369,9 +371,7 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; - check_user_regs(regs); - - enter_from_user_mode(); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -434,9 +434,7 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - check_user_regs(regs); - - enter_from_user_mode(); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -487,8 +485,6 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) vdso_image_32.sym_int80_landing_pad; bool success; - check_user_regs(regs); - /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. @@ -496,7 +492,7 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - enter_from_user_mode(); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -599,8 +595,7 @@ idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) }; if (user_mode(regs)) { - check_user_regs(regs); - enter_from_user_mode(); + enter_from_user_mode(regs); return ret; } @@ -733,8 +728,7 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) */ void noinstr idtentry_enter_user(struct pt_regs *regs) { - check_user_regs(regs); - enter_from_user_mode(); + enter_from_user_mode(regs); } /** From 0b085e68f4072024ecaa3889aeeaab5f6c8eba5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:01 +0200 Subject: [PATCH 04/13] x86/entry: Consolidate 32/64 bit syscall entry 64bit and 32bit entry code have the same open coded syscall entry handling after the bitwidth specific bits. Move it to a helper function and share the code. 
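The shared shape, roughly (sketch; only the syscall table and its bound stay bitwidth specific):

	nr = syscall_enter(regs, nr);		/* common entry work, IRQs on */
	instrumentation_begin();
	if (likely(nr < NR_syscalls)) {		/* IA32_NR_syscalls on the 32-bit path */
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
	}
	instrumentation_end();
	syscall_return_slowpath(regs);		/* common exit work, returns with IRQs off */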
Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220520.051234096@linutronix.de --- arch/x86/entry/common.c | 95 ++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 53 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ab6cb8602c0b..68d5c86b1985 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -366,8 +366,7 @@ __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) exit_to_user_mode(); } -#ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) +static noinstr long syscall_enter(struct pt_regs *regs, unsigned long nr) { struct thread_info *ti; @@ -379,6 +378,16 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); + instrumentation_end(); + return nr; +} + +#ifdef CONFIG_X86_64 +__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) +{ + nr = syscall_enter(regs, nr); + + instrumentation_begin(); if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); @@ -390,64 +399,53 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) regs->ax = x32_sys_call_table[nr](regs); #endif } - __syscall_return_slowpath(regs); - instrumentation_end(); - exit_to_user_mode(); + syscall_return_slowpath(regs); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -/* - * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does - * all entry and exit work and returns with IRQs off. This function is - * extremely hot in workloads that use it, and it's usually called from - * do_fast_syscall_32, so forcibly inline it to improve performance. - */ -static void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { - struct thread_info *ti = current_thread_info(); - unsigned int nr = (unsigned int)regs->orig_ax; - -#ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; -#endif - - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { - /* - * Subtlety here: if ptrace pokes something larger than - * 2^32-1 into orig_ax, this truncates it. This may or - * may not be necessary, but it matches the old asm - * behavior. - */ - nr = syscall_trace_enter(regs); - } + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + current_thread_info()->status |= TS_COMPAT; + /* + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. + */ + return syscall_enter(regs, (unsigned int)regs->orig_ax); +} +/* + * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. 
+ */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, + unsigned int nr) +{ if (likely(nr < IA32_NR_syscalls)) { + instrumentation_begin(); nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); + instrumentation_end(); } - - __syscall_return_slowpath(regs); } /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - enter_from_user_mode(regs); - instrumentation_begin(); + unsigned int nr = syscall_32_enter(regs); - local_irq_enable(); - do_syscall_32_irqs_on(regs); - - instrumentation_end(); - exit_to_user_mode(); + do_syscall_32_irqs_on(regs, nr); + syscall_return_slowpath(regs); } -static bool __do_fast_syscall_32(struct pt_regs *regs) +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { + unsigned int nr = syscall_32_enter(regs); int res; + instrumentation_begin(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { /* @@ -460,17 +458,18 @@ static bool __do_fast_syscall_32(struct pt_regs *regs) res = get_user(*(u32 *)&regs->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp); } + instrumentation_end(); if (res) { /* User code screwed up. */ regs->ax = -EFAULT; - local_irq_disable(); - __prepare_exit_to_usermode(regs); + syscall_return_slowpath(regs); return false; } /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs); + do_syscall_32_irqs_on(regs, nr); + syscall_return_slowpath(regs); return true; } @@ -483,7 +482,6 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) */ unsigned long landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; - bool success; /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward @@ -492,17 +490,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); - success = __do_fast_syscall_32(regs); - - instrumentation_end(); - exit_to_user_mode(); - - /* If it failed, keep it simple: use IRET. */ - if (!success) + /* Invoke the syscall. If it failed, keep it simple: use IRET. */ + if (!__do_fast_syscall_32(regs)) return 0; #ifdef CONFIG_X86_64 From a377ac1cd9d7b9ac8d546dceb3d74956fbfd443f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:02 +0200 Subject: [PATCH 05/13] x86/entry: Move user return notifier out of loop Guests and user space share certain MSRs. KVM sets these MSRs to guest values once and does not set them back to user space values on every VM exit to spare the costly MSR operations. User return notifiers ensure that these MSRs are set back to the correct values before returning to user space in exit_to_usermode_loop(). There is no reason to evaluate the TIF flag indicating that user return notifiers need to be invoked in the loop. The important point is that they are invoked before returning to user space. Move the invocation out of the loop into the section which does the last preparatory steps before returning to user space. That section is not preemptible and runs with interrupts disabled until the actual return.
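The resulting ordering in __prepare_exit_to_usermode(), roughly (illustrative sketch built from the hunks below, not additional code):

	/* Loop until no EXIT_TO_USERMODE_LOOP_FLAGS work is left, IRQs off on exit */
	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; the loop may have rescheduled */
	cached_flags = READ_ONCE(ti->flags);

	/* One-shot work: set the MSRs back to the user space values exactly once */
	if (cached_flags & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();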
Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220520.159112003@linutronix.de --- arch/x86/entry/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 68d5c86b1985..9415ae5c6a29 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -208,7 +208,7 @@ static long syscall_trace_enter(struct pt_regs *regs) #define EXIT_TO_USERMODE_LOOP_FLAGS \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING) static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) { @@ -242,9 +242,6 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) rseq_handle_notify_resume(NULL, regs); } - if (cached_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - /* Disable IRQs and retry */ local_irq_disable(); @@ -273,6 +270,9 @@ static void __prepare_exit_to_usermode(struct pt_regs *regs) /* Reload ti->flags; we may have rescheduled above. */ cached_flags = READ_ONCE(ti->flags); + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + if (unlikely(cached_flags & _TIF_IO_BITMAP)) tss_update_io_bitmap(); From 0bf019ea59e330770883ede4499d7f711d8c3adf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:03 +0200 Subject: [PATCH 06/13] x86/ptrace: Provide pt_regs helper for entry/exit As a preparatory step for moving the syscall and interrupt entry/exit handling into generic code, provide a pt_regs helper which retrieves the interrupt state from pt_regs. This is required to check whether interrupts are reenabled by return from interrupt/exception. Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220520.258511584@linutronix.de --- arch/x86/include/asm/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 255b2dde2c1b..40aa69d04862 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -209,6 +209,11 @@ static inline void user_stack_pointer_set(struct pt_regs *regs, regs->sp = val; } +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) +{ + return !(regs->flags & X86_EFLAGS_IF); +} + /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); From 27d6b4d14f5c3ab21c4aef87dd04055a2d7adf14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:04 +0200 Subject: [PATCH 07/13] x86/entry: Use generic syscall entry function Replace the syscall entry work handling with the generic version. Provide the necessary helper inlines to handle the real architecture specific parts, e.g. ptrace. Use a temporary define for idtentry_enter_user which will be cleaned up separately.
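With the generic entry code selected via GENERIC_ENTRY, the 64-bit syscall path reduces to roughly the following (sketch of the result, matching the diff below; syscall_enter_from_user_mode() is the generic replacement for the removed local syscall_enter()):

	__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
	{
		/* Generic entry work: regs check, context tracking, ptrace/seccomp/audit */
		nr = syscall_enter_from_user_mode(regs, nr);

		instrumentation_begin();
		if (likely(nr < NR_syscalls)) {
			nr = array_index_nospec(nr, NR_syscalls);
			regs->ax = sys_call_table[nr](regs);
		}
		instrumentation_end();
		syscall_return_slowpath(regs);
	}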
Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220520.376213694@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 181 ++-------------------------- arch/x86/include/asm/entry-common.h | 32 +++++ arch/x86/include/asm/idtentry.h | 5 +- arch/x86/include/asm/thread_info.h | 5 - 5 files changed, 45 insertions(+), 179 deletions(-) create mode 100644 arch/x86/include/asm/entry-common.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 883da0abf779..ccf02e6c144c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -115,6 +115,7 @@ config X86 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP + select GENERIC_ENTRY select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9415ae5c6a29..d2fe85f44915 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -10,13 +10,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #include #include #include @@ -42,70 +42,8 @@ #include #include -#define CREATE_TRACE_POINTS #include -/* Check that the stack and regs on entry from user mode are sane. */ -static noinstr void check_user_regs(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { - /* - * Make sure that the entry code gave us a sensible EFLAGS - * register. Native because we want to check the actual CPU - * state, not the interrupt state as imagined by Xen. - */ - unsigned long flags = native_save_fl(); - WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | - X86_EFLAGS_NT)); - - /* We think we came from user mode. Make sure pt_regs agrees. */ - WARN_ON_ONCE(!user_mode(regs)); - - /* - * All entries from user mode (except #DF) should be on the - * normal thread stack and should have user pt_regs in the - * correct location. - */ - WARN_ON_ONCE(!on_thread_stack()); - WARN_ON_ONCE(regs != task_pt_regs(current)); - } -} - -#ifdef CONFIG_CONTEXT_TRACKING -/** - * enter_from_user_mode - Establish state when coming from user mode - * - * Syscall entry disables interrupts, but user mode is traced as interrupts - * enabled. Also with NO_HZ_FULL RCU might be idle. 
- * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ -static noinstr void enter_from_user_mode(struct pt_regs *regs) -{ - enum ctx_state state = ct_state(); - - check_user_regs(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - user_exit_irqoff(); - - instrumentation_begin(); - CT_WARN_ON(state != CONTEXT_USER); - trace_hardirqs_off_finish(); - instrumentation_end(); -} -#else -static __always_inline void enter_from_user_mode(struct pt_regs *regs) -{ - check_user_regs(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} -#endif - /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -129,83 +67,6 @@ static __always_inline void exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * Returns the syscall nr to run (which should match regs->orig_ax) or -1 - * to skip the syscall. - */ -static long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - - struct thread_info *ti = current_thread_info(); - unsigned long ret = 0; - u32 work; - - work = READ_ONCE(ti->flags); - - if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { - ret = tracehook_report_syscall_entry(regs); - if (ret || (work & _TIF_SYSCALL_EMU)) - return -1L; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp after ptrace, to catch any tracer changes. 
- */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; - - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else -#endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - ret = __secure_computing(&sd); - if (ret == -1) - return ret; - } -#endif - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); - - return ret ?: regs->orig_ax; -} - #define EXIT_TO_USERMODE_LOOP_FLAGS \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING) @@ -366,26 +227,10 @@ __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) exit_to_user_mode(); } -static noinstr long syscall_enter(struct pt_regs *regs, unsigned long nr) -{ - struct thread_info *ti; - - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); - ti = current_thread_info(); - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) - nr = syscall_trace_enter(regs); - - instrumentation_end(); - return nr; -} - #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { - nr = syscall_enter(regs, nr); + nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); if (likely(nr < NR_syscalls)) { @@ -407,6 +252,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { + unsigned int nr = (unsigned int)regs->orig_ax; + if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; /* @@ -414,7 +261,7 @@ static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) * orig_ax, the unsigned int return value truncates it. This may * or may not be necessary, but it matches the old asm behavior. */ - return syscall_enter(regs, (unsigned int)regs->orig_ax); + return (unsigned int)syscall_enter_from_user_mode(regs, nr); } /* @@ -568,7 +415,7 @@ SYSCALL_DEFINE0(ni_syscall) * solves the problem of kernel mode pagefaults which can schedule, which * is not possible after invoking rcu_irq_enter() without undoing it. * - * For user mode entries enter_from_user_mode() must be invoked to + * For user mode entries irqentry_enter_from_user_mode() must be invoked to * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * @@ -584,7 +431,7 @@ idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) }; if (user_mode(regs)) { - enter_from_user_mode(regs); + irqentry_enter_from_user_mode(regs); return ret; } @@ -615,7 +462,7 @@ idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required - * as in enter_from_user_mode(). + * as in irqentry_enter_from_user_mode(). 
*/ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); @@ -708,18 +555,6 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) } } -/** - * idtentry_enter_user - Handle state tracking on idtentry from user mode - * @regs: Pointer to pt_regs of interrupted context - * - * Invokes enter_from_user_mode() to establish the proper context for - * NOHZ_FULL. Otherwise scheduling on exit would not be possible. - */ -void noinstr idtentry_enter_user(struct pt_regs *regs) -{ - enter_from_user_mode(regs); -} - /** * idtentry_exit_user - Handle return from exception to user mode * @regs: Pointer to pt_regs (exception entry regs) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h new file mode 100644 index 000000000000..7070b90c8312 --- /dev/null +++ b/arch/x86/include/asm/entry-common.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_ENTRY_COMMON_H +#define _ASM_X86_ENTRY_COMMON_H + +/* Check that the stack and regs on entry from user mode are sane. */ +static __always_inline void arch_check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. + */ + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} +#define arch_check_user_regs arch_check_user_regs + +#endif diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 1bc6f878bd30..449910fd454b 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -6,11 +6,14 @@ #include #ifndef __ASSEMBLY__ +#include #include #include -void idtentry_enter_user(struct pt_regs *regs); +/* Temporary define */ +#define idtentry_enter_user irqentry_enter_from_user_mode + void idtentry_exit_user(struct pt_regs *regs); typedef struct idtentry_state { diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8de8ceccb8bc..267701ae3d86 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -133,11 +133,6 @@ struct thread_info { #define _TIF_X32 (1 << TIF_X32) #define _TIF_FSCHECK (1 << TIF_FSCHECK) -/* Work to do before invoking the actual syscall. */ -#define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ From 167fd210ec0555d371a20435dac7c2c7052df7ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:05 +0200 Subject: [PATCH 08/13] x86/entry: Use generic syscall exit functionality Replace the x86 variant with the generic version. Provide the relevant architecture specific helper functions and defines. Use a temporary define for idtentry_exit_user which will be cleaned up separately.
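The remaining x86 specific exit work is expressed as two hooks which the generic exit path invokes; roughly (sketch of the contract, the real implementations are in the asm/entry-common.h hunk below):

	/* Invoked with IRQs disabled after the generic exit work loop */
	static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
							  unsigned long ti_work)
	{
		/* user return notifiers, I/O bitmap, FPU state, TS_COMPAT cleanup */
	}

	/* Very last step before returning; must be noinstr safe */
	static __always_inline void arch_exit_to_user_mode(void)
	{
		/* MDS mitigation: clear CPU buffers */
	}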
Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220520.494648601@linutronix.de --- arch/x86/entry/common.c | 221 +--------------------------- arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/entry-common.h | 44 ++++++ arch/x86/include/asm/idtentry.h | 3 +- arch/x86/include/asm/signal.h | 1 - arch/x86/kernel/signal.c | 3 +- 7 files changed, 54 insertions(+), 222 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d2fe85f44915..bc96eb8e055a 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -15,15 +15,8 @@ #include #include #include -#include -#include -#include #include -#include -#include #include -#include -#include #include #include @@ -42,191 +35,6 @@ #include #include -#include - -/** - * exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall exit enables interrupts, but the kernel state is interrupts - * disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Clear CPU buffers if CPU is affected by MDS and the migitation is on. - * 4) Tell lockdep that interrupts are enabled - */ -static __always_inline void exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - user_enter_irqoff(); - mds_user_clear_cpu_buffers(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -#define EXIT_TO_USERMODE_LOOP_FLAGS \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING) - -static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) -{ - /* - * In order to return to user mode, we need to have IRQs off with - * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags - * can be set at any time on preemptible kernels if we have IRQs on, - * so we need to loop. Disabling preemption wouldn't help: doing the - * work to clear some of the flags can sleep. - */ - while (true) { - /* We have work to do. */ - local_irq_enable(); - - if (cached_flags & _TIF_NEED_RESCHED) - schedule(); - - if (cached_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (cached_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - - /* deal with pending signal delivery */ - if (cached_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (cached_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } - - /* Disable IRQs and retry */ - local_irq_disable(); - - cached_flags = READ_ONCE(current_thread_info()->flags); - - if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - break; - } -} - -static void __prepare_exit_to_usermode(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - u32 cached_flags; - - addr_limit_user_check(); - - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); - - cached_flags = READ_ONCE(ti->flags); - - if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - exit_to_usermode_loop(regs, cached_flags); - - /* Reload ti->flags; we may have rescheduled above. 
*/ - cached_flags = READ_ONCE(ti->flags); - - if (cached_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - if (unlikely(cached_flags & _TIF_IO_BITMAP)) - tss_update_io_bitmap(); - - fpregs_assert_state_consistent(); - if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. We need to clear it *after* signal - * handling, because syscall restart has a fixup for compat - * syscalls. The fixup is exercised by the ptrace_syscall_32 - * selftest. - * - * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer - * special case only applies after poking regs and before the - * very next return to user mode. - */ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); -#endif -} - -static noinstr void prepare_exit_to_usermode(struct pt_regs *regs) -{ - instrumentation_begin(); - __prepare_exit_to_usermode(regs); - instrumentation_end(); - exit_to_user_mode(); -} - -#define SYSCALL_EXIT_WORK_FLAGS \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) - -static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) -{ - bool step; - - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); -} - -static void __syscall_return_slowpath(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - u32 cached_flags = READ_ONCE(ti->flags); - - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && - WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) - local_irq_enable(); - - rseq_syscall(regs); - - /* - * First do one-time work. If these work items are enabled, we - * want to run them exactly once per syscall exit with IRQs on. - */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) - syscall_slow_exit_work(regs, cached_flags); - - local_irq_disable(); - __prepare_exit_to_usermode(regs); -} - -/* - * Called with IRQs on and fully valid regs. Returns with IRQs off in a - * state such that we can immediately switch to user mode. - */ -__visible noinstr void syscall_return_slowpath(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_return_slowpath(regs); - instrumentation_end(); - exit_to_user_mode(); -} - #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { @@ -245,7 +53,7 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) #endif } instrumentation_end(); - syscall_return_slowpath(regs); + syscall_exit_to_user_mode(regs); } #endif @@ -284,7 +92,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) unsigned int nr = syscall_32_enter(regs); do_syscall_32_irqs_on(regs, nr); - syscall_return_slowpath(regs); + syscall_exit_to_user_mode(regs); } static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) @@ -310,13 +118,13 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) if (res) { /* User code screwed up. 
*/ regs->ax = -EFAULT; - syscall_return_slowpath(regs); + syscall_exit_to_user_mode(regs); return false; } /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); - syscall_return_slowpath(regs); + syscall_exit_to_user_mode(regs); return true; } @@ -524,7 +332,7 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) /* Check whether this returns to user mode */ if (user_mode(regs)) { - prepare_exit_to_usermode(regs); + irqentry_exit_to_user_mode(regs); } else if (regs->flags & X86_EFLAGS_IF) { /* * If RCU was not watching on entry this needs to be done @@ -555,25 +363,6 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) } } -/** - * idtentry_exit_user - Handle return from exception to user mode - * @regs: Pointer to pt_regs (exception entry regs) - * - * Runs the necessary preemption and work checks and returns to the caller - * with interrupts disabled and no further work pending. - * - * This is the last action before returning to the low level ASM code which - * just needs to return to the appropriate context. - * - * Counterpart to idtentry_enter_user(). - */ -void noinstr idtentry_exit_user(struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - prepare_exit_to_usermode(regs); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2d0bd5d5f032..6addbd1d0775 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -846,7 +846,7 @@ SYM_CODE_START(ret_from_fork) 2: /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax - call syscall_return_slowpath + call syscall_exit_to_user_mode jmp .Lsyscall_32_done /* kernel thread */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a00c97e53f..f423ca9e8a51 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -283,7 +283,7 @@ SYM_CODE_START(ret_from_fork) 2: UNWIND_HINT_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode 1: diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 7070b90c8312..a8f9315b9eae 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -2,6 +2,12 @@ #ifndef _ASM_X86_ENTRY_COMMON_H #define _ASM_X86_ENTRY_COMMON_H +#include + +#include +#include +#include + /* Check that the stack and regs on entry from user mode are sane. */ static __always_inline void arch_check_user_regs(struct pt_regs *regs) { @@ -29,4 +35,42 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) } #define arch_check_user_regs arch_check_user_regs +#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP) + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + if (unlikely(ti_work & _TIF_IO_BITMAP)) + tss_update_io_bitmap(); + + fpregs_assert_state_consistent(); + if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. 
+ * + * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer + * special case only applies after poking regs and before the + * very next return to user mode. + */ + current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED); +#endif +} +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + +static __always_inline void arch_exit_to_user_mode(void) +{ + mds_user_clear_cpu_buffers(); +} +#define arch_exit_to_user_mode arch_exit_to_user_mode + #endif diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 449910fd454b..f7d48ea51ab3 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,8 +13,7 @@ /* Temporary define */ #define idtentry_enter_user irqentry_enter_from_user_mode - -void idtentry_exit_user(struct pt_regs *regs); +#define idtentry_exit_user irqentry_exit_to_user_mode typedef struct idtentry_state { bool exit_rcu; diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 33d3c88a7225..6fd8410a3910 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -35,7 +35,6 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include #ifndef __ASSEMBLY__ -extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 399f97abee02..d5fa494c2304 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -803,7 +804,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal(struct pt_regs *regs) { struct ksignal ksig; From 517e499227bed34acc69166691e2db5df3dc859a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:06 +0200 Subject: [PATCH 09/13] x86/entry: Cleanup idtentry_enter/exit_user Cleanup the temporary defines and use irqentry_ instead of idtentry_.
Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220520.602603691@linutronix.de --- arch/x86/include/asm/idtentry.h | 4 ---- arch/x86/kernel/cpu/mce/core.c | 4 ++-- arch/x86/kernel/traps.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index f7d48ea51ab3..bf59f72140cc 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,10 +11,6 @@ #include -/* Temporary define */ -#define idtentry_enter_user irqentry_enter_from_user_mode -#define idtentry_exit_user irqentry_exit_to_user_mode - typedef struct idtentry_state { bool exit_rcu; } idtentry_state_t; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6d7aa5642688..97ff8313544f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1927,11 +1927,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) static __always_inline void exc_machine_check_user(struct pt_regs *regs) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); machine_check_vector(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ab6828e360f2..59c7f54753b4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -638,18 +638,18 @@ DEFINE_IDTENTRY_RAW(exc_int3) return; /* - * idtentry_enter_user() uses static_branch_{,un}likely() and therefore - * can trigger INT3, hence poke_int3_handler() must be done - * before. If the entry came from kernel mode, then use nmi_enter() - * because the INT3 could have been hit in any context including - * NMI. + * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() + * and therefore can trigger INT3, hence poke_int3_handler() must + * be done before. If the entry came from kernel mode, then use + * nmi_enter() because the INT3 could have been hit in any context + * including NMI. */ if (user_mode(regs)) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } else { nmi_enter(); instrumentation_begin(); @@ -901,12 +901,12 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, */ WARN_ON_ONCE(!user_mode(regs)); - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); handle_debug(regs, dr6, true); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 From bdcd178ada90d2413bcc9df4211dcdd511a47586 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:07 +0200 Subject: [PATCH 10/13] x86/entry: Use generic interrupt entry/exit code Replace the x86 code with the generic variant. Use temporary defines for idtentry_* which will be cleaned up in the next step. 
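Converted raw entry points then follow the generic pattern; roughly (sketch, the actual conversions land in the follow-up patch):

	irqentry_state_t state = irqentry_enter(regs);

	instrumentation_begin();
	/* exception handler body */
	instrumentation_end();

	irqentry_exit(regs, state);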
Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220520.711492752@linutronix.de --- arch/x86/entry/common.c | 167 +------------------------------- arch/x86/include/asm/idtentry.h | 10 +- 2 files changed, 5 insertions(+), 172 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bc96eb8e055a..297e08ea9f87 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -198,171 +198,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -/** - * idtentry_enter - Handle state tracking on ordinary idtentries - * @regs: Pointer to pt_regs of interrupted context - * - * Invokes: - * - lockdep irqflag state tracking as low level ASM entry disabled - * interrupts. - * - * - Context tracking if the exception hit user mode. - * - * - The hardirq tracer to keep the state consistent as low level ASM - * entry disabled interrupts. - * - * As a precondition, this requires that the entry came from user mode, - * idle, or a kernel context in which RCU is watching. - * - * For kernel mode entries RCU handling is done conditional. If RCU is - * watching then the only RCU requirement is to check whether the tick has - * to be restarted. If RCU is not watching then rcu_irq_enter() has to be - * invoked on entry and rcu_irq_exit() on exit. - * - * Avoiding the rcu_irq_enter/exit() calls is an optimization but also - * solves the problem of kernel mode pagefaults which can schedule, which - * is not possible after invoking rcu_irq_enter() without undoing it. - * - * For user mode entries irqentry_enter_from_user_mode() must be invoked to - * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit - * would not be possible. - * - * Returns: An opaque object that must be passed to idtentry_exit() - * - * The return value must be fed into the state argument of - * idtentry_exit(). - */ -idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs) -{ - idtentry_state_t ret = { - .exit_rcu = false, - }; - - if (user_mode(regs)) { - irqentry_enter_from_user_mode(regs); - return ret; - } - - /* - * If this entry hit the idle task invoke rcu_irq_enter() whether - * RCU is watching or not. - * - * Interupts can nest when the first interrupt invokes softirq - * processing on return which enables interrupts. - * - * Scheduler ticks in the idle task can mark quiescent state and - * terminate a grace period, if and only if the timer interrupt is - * not nested into another interrupt. - * - * Checking for __rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is - * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. - * - * Unconditionally invoke rcu_irq_enter() so RCU state stays - * consistent. - * - * TINY_RCU does not support EQS, so let the compiler eliminate - * this part when enabled. - */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - /* - * If RCU is not watching then the same careful - * sequence vs. lockdep and tracing is required - * as in irqentry_enter_from_user_mode(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); - - ret.exit_rcu = true; - return ret; - } - - /* - * If RCU is watching then RCU only wants to check whether it needs - * to restart the tick in NOHZ mode. 
rcu_irq_enter_check_tick() - * already contains a warning when RCU is not watching, so no point - * in having another one here. - */ - instrumentation_begin(); - rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); - instrumentation_end(); - - return ret; -} - -static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) -{ - if (may_sched && !preempt_count()) { - /* Sanity check RCU and thread stack */ - rcu_irq_exit_check_preempt(); - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) - preempt_schedule_irq(); - } - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); -} - -/** - * idtentry_exit - Handle return from exception that used idtentry_enter() - * @regs: Pointer to pt_regs (exception entry regs) - * @state: Return value from matching call to idtentry_enter() - * - * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and reguired and returns to - * the caller with interrupts disabled and no further work pending. - * - * This is the last action before returning to the low level ASM code which - * just needs to return to the appropriate context. - * - * Counterpart to idtentry_enter(). The return value of the entry - * function must be fed into the @state argument. - */ -void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state) -{ - lockdep_assert_irqs_disabled(); - - /* Check whether this returns to user mode */ - if (user_mode(regs)) { - irqentry_exit_to_user_mode(regs); - } else if (regs->flags & X86_EFLAGS_IF) { - /* - * If RCU was not watching on entry this needs to be done - * carefully and needs the same ordering of lockdep/tracing - * and RCU as the return to user mode path. - */ - if (state.exit_rcu) { - instrumentation_begin(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - rcu_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - instrumentation_begin(); - idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION)); - instrumentation_end(); - } else { - /* - * IRQ flags state is correct already. Just tell RCU if it - * was not watching on entry. 
- */ - if (state.exit_rcu) - rcu_irq_exit(); - } -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* @@ -427,7 +262,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) inhcall = get_and_clear_inhcall(); if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { instrumentation_begin(); - idtentry_exit_cond_resched(regs, true); + irqentry_exit_cond_resched(); instrumentation_end(); restore_inhcall(inhcall); } else { diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index bf59f72140cc..621e25d08a3f 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,12 +11,10 @@ #include -typedef struct idtentry_state { - bool exit_rcu; -} idtentry_state_t; - -idtentry_state_t idtentry_enter(struct pt_regs *regs); -void idtentry_exit(struct pt_regs *regs, idtentry_state_t state); +/* Temporary defines */ +typedef irqentry_state_t idtentry_state_t; +#define idtentry_enter irqentry_enter +#define idtentry_exit irqentry_exit /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points From a27a0a55495cdde4b8d98f82460dc46eb44777fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:08 +0200 Subject: [PATCH 11/13] x86/entry: Cleanup idtentry_enter/exit Remove the temporary defines and fixup all references. Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220520.855839271@linutronix.de --- arch/x86/entry/common.c | 6 +++--- arch/x86/include/asm/idtentry.h | 33 ++++++++++++++------------------- arch/x86/kernel/kvm.c | 6 +++--- arch/x86/kernel/traps.c | 6 +++--- arch/x86/mm/fault.c | 6 +++--- 5 files changed, 26 insertions(+), 31 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 297e08ea9f87..3de0303703ae 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -248,9 +248,9 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs; bool inhcall; - idtentry_state_t state; + irqentry_state_t state; - state = idtentry_enter(regs); + state = irqentry_enter(regs); old_regs = set_irq_regs(regs); instrumentation_begin(); @@ -266,7 +266,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) instrumentation_end(); restore_inhcall(inhcall); } else { - idtentry_exit(regs, state); + irqentry_exit(regs, state); } } #endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 621e25d08a3f..73eb277e63ab 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,11 +11,6 @@ #include -/* Temporary defines */ -typedef irqentry_state_t idtentry_state_t; -#define idtentry_enter irqentry_enter -#define idtentry_exit irqentry_exit - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware @@ -45,8 +40,8 @@ typedef irqentry_state_t idtentry_state_t; * The macro is written so it acts as function definition. Append the * body with a pair of curly brackets. * - * idtentry_enter() contains common code which has to be invoked before - * arbitrary code in the body. idtentry_exit() contains common code + * irqentry_enter() contains common code which has to be invoked before + * arbitrary code in the body. irqentry_exit() contains common code * which has to run before returning to the low level assembly code. 
*/ #define DEFINE_IDTENTRY(func) \ @@ -54,12 +49,12 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - idtentry_state_t state = idtentry_enter(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs); \ instrumentation_end(); \ - idtentry_exit(regs, state); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -101,12 +96,12 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - idtentry_state_t state = idtentry_enter(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs, error_code); \ instrumentation_end(); \ - idtentry_exit(regs, state); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ @@ -161,7 +156,7 @@ __visible noinstr void func(struct pt_regs *regs) * body with a pair of curly brackets. * * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the - * idtentry_enter/exit() helpers before and after the body invocation. This + * irqentry_enter/exit() helpers before and after the body invocation. This * needs to be done in the body itself if applicable. Use if extra work * is required before the enter/exit() helpers are invoked. */ @@ -197,7 +192,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - idtentry_state_t state = idtentry_enter(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -205,7 +200,7 @@ __visible noinstr void func(struct pt_regs *regs, \ __##func (regs, (u8)error_code); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit(regs, state); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) @@ -229,7 +224,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector) * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points * @func: Function name of the entry point * - * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the + * irqentry_enter/exit() and irq_enter/exit_rcu() are invoked before the * function body. KVM L1D flush request is set. 
* * Runs the function on the interrupt stack if the entry hit kernel mode @@ -239,7 +234,7 @@ static void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - idtentry_state_t state = idtentry_enter(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -247,7 +242,7 @@ __visible noinstr void func(struct pt_regs *regs) \ run_on_irqstack_cond(__##func, regs, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit(regs, state); \ + irqentry_exit(regs, state); \ } \ \ static noinline void __##func(struct pt_regs *regs) @@ -268,7 +263,7 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - idtentry_state_t state = idtentry_enter(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __irq_enter_raw(); \ @@ -276,7 +271,7 @@ __visible noinstr void func(struct pt_regs *regs) \ __##func (regs); \ __irq_exit_raw(); \ instrumentation_end(); \ - idtentry_exit(regs, state); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3f78482d9496..233c77d056c9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); - idtentry_state_t state; + irqentry_state_t state; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } - state = idtentry_enter(regs); + state = irqentry_enter(regs); instrumentation_begin(); /* @@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) } instrumentation_end(); - idtentry_exit(regs, state); + irqentry_exit(regs, state); return true; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 59c7f54753b4..be8fcfec004a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_invalid_op) { - idtentry_state_t state; + irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such @@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; - state = idtentry_enter(regs); + state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); - idtentry_exit(regs, state); + irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5e41949453cc..5e5edd2ec893 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1377,7 +1377,7 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { unsigned long address = read_cr2(); - idtentry_state_t state; + irqentry_state_t state; prefetchw(&current->mm->mmap_lock); @@ -1412,11 +1412,11 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * code reenabled RCU to avoid subsequent wreckage which helps * debugability.
*/ - state = idtentry_enter(regs); + state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); - idtentry_exit(regs, state); + irqentry_exit(regs, state); } From 72c3c0fe54a3f3ddea8f5ca468ddf9deaf2100b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jul 2020 00:00:09 +0200 Subject: [PATCH 12/13] x86/kvm: Use generic xfer to guest work function Use the generic infrastructure to check for and handle pending work before transitioning into guest mode. This now handles TIF_NOTIFY_RESUME as well which was ignored so far. Handling it is important as this covers task work and task work will be used to offload the heavy lifting of POSIX CPU timers to thread context. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220520.979724969@linutronix.de --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/vmx/vmx.c | 11 +++++------ arch/x86/kvm/x86.c | 15 ++++++--------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b277a2db6267..fbd5bd7a945a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -42,6 +42,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL + select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 13745f2a5ecd..9909375ee1fd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -5373,14 +5374,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } /* - * Note, return 1 and not 0, vcpu_run() is responsible for - * morphing the pending signal into the proper return code. + * Note, return 1 and not 0, vcpu_run() will invoke + * xfer_to_guest_mode() which will create a proper return + * code. 
*/ - if (signal_pending(current)) + if (__xfer_to_guest_mode_work_pending()) return 1; - - if (need_resched()) - schedule(); } return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88c593f83b28..82d4a9e88908 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -1587,7 +1588,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || - need_resched() || signal_pending(current); + xfer_to_guest_mode_work_pending(); } EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); @@ -8681,15 +8682,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - break; - } - if (need_resched()) { + if (xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); + r = xfer_to_guest_mode_handle_work(vcpu); + if (r) + return r; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } From f3020b8891b890b48d9e1a83241e3cce518427c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 09:19:01 +0200 Subject: [PATCH 13/13] x86/kvm: Use __xfer_to_guest_mode_work_pending() in vcpu_run() The comments explicitly explain that the work flags check and handling in vcpu_run() is done with preemption and interrupts enabled as KVM invokes the check again right before entering guest mode with interrupts disabled which guarantees that the work flags are observed and handled before VMENTER. Nevertheless the flag pending check in vcpu_run() uses the helper variant which requires interrupts to be disabled, triggering an instant lockdep splat. This was caught in testing before and then not fixed up in the patch before applying. :( Use the relaxed and intentionally racy __xfer_to_guest_mode_work_pending() instead. Fixes: 72c3c0fe54a3 ("x86/kvm: Use generic xfer to guest work function") Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87bljxa2sa.fsf@nanos.tec.linutronix.de --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82d4a9e88908..532597265c50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8682,7 +8682,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - if (xfer_to_guest_mode_work_pending()) { + if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); r = xfer_to_guest_mode_handle_work(vcpu); if (r)