diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..45e34be4ed56 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1893,6 +1893,12 @@ 0 -- machine default 1 -- force brightness inversion + ia32_emulation= [X86-64] + Format: <bool> + When true, allows loading 32-bit programs and executing 32-bit + syscalls, essentially overriding IA32_EMULATION_DEFAULT_DISABLED at + boot time. When false, unconditionally disables IA32 emulation. + icn= [HW,ISDN] Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9b2a598cce8d..ad478a2b49e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2955,6 +2955,15 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. +config IA32_EMULATION_DEFAULT_DISABLED + bool "IA32 emulation disabled by default" + default n + depends on IA32_EMULATION + help + Make IA32 emulation disabled by default. This prevents loading 32-bit + processes and access to 32-bit syscalls. If unsure, leave it to its + default value. 
+ config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 93c60c0c9d4a..d813160b14d8 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_XEN_PV #include @@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) return false; } -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); @@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) instrumentation_end(); syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. + */ + if (unlikely(regs->ip >= TASK_SIZE_MAX)) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. 
+ */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; } #endif @@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs) return (int)regs->orig_ax; } +#ifdef CONFIG_IA32_EMULATION +bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); + +static int ia32_emulation_override_cmdline(char *arg) +{ + return kstrtobool(arg, &__ia32_enabled); +} +early_param("ia32_emulation", ia32_emulation_override_cmdline); +#endif + /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ @@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return true; } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling @@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) /* Invoke the syscall. If it failed, keep it simple: use IRET. */ if (!__do_fast_syscall_32(regs)) - return 0; + return false; -#ifdef CONFIG_X86_64 /* - * Opportunistic SYSRETL: if possible, try to return using SYSRETL. - * SYSRETL is available on all 64-bit CPUs, so we don't need to - * bother with SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. + * Check that the register state is valid for using SYSRETL/SYSEXIT + * to exit to userspace. Otherwise use the slower but fully capable + * IRET exit path. 
*/ - return regs->cs == __USER32_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; -#else - /* - * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - * - * We don't allow syscalls at all from VM86 mode, but we still - * need to check VM, because we might be returning from sys_vm86. - */ - return static_cpu_has(X86_FEATURE_SEP) && - regs->cs == __USER_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; -#endif + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* EIP must point to the VDSO landing pad */ + if (unlikely(regs->ip != landing_pad)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) + return false; + + /* If the TF, RF, or VM flags are set, use IRET */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) + return false; + + /* Use SYSRETL/SYSEXIT to exit to userspace */ + return true; } -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) { /* SYSENTER loses RSP, but the vDSO saved it in RBP. 
*/ regs->sp = regs->bp; diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6e6af42e044a..c73047bf9f4b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32) movl %esp, %eax call do_SYSENTER_32 - testl %eax, %eax + testb %al, %al jz .Lsyscall_32_done STACKLEAK_ERASE diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d656924eefc2..de6469dffe3a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ - X86_FEATURE_XENPV - - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. 
- */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* * We win! This label is here just for ease of understanding @@ -1509,18 +1447,16 @@ nmi_restore: iretq SYM_CODE_END(asm_exc_nmi) -#ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. 
*/ -SYM_CODE_START(ignore_sysret) +SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax sysretl -SYM_CODE_END(ignore_sysret) -#endif +SYM_CODE_END(entry_SYSCALL32_ignore) .pushsection .text, "ax" __FUNC_ALIGN diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 70150298f8bd..27c05d08558a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_SYSENTER_32 - /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ - "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_fast_syscall_32 + +sysret32_from_system_call: /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV - /* Opportunistic SYSRET */ -sysret32_from_system_call: /* + * Opportunistic SYSRET + * * We are not going to return to userspace from the trampoline * stack. So let's erase the thread stack right now. 
*/ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 18fd06f7936a..a0234dfd1031 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -149,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index fada857f0a1e..5a2ae24b1204 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -68,6 +68,20 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif -#endif /* CONFIG_IA32_EMULATION */ +extern bool __ia32_enabled; + +static inline bool ia32_enabled(void) +{ + return __ia32_enabled; +} + +#else /* !CONFIG_IA32_EMULATION */ + +static inline bool ia32_enabled(void) +{ + return IS_ENABLED(CONFIG_X86_32); +} + +#endif #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a3669a7774ed..6e30b27b1ebe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -399,7 +399,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 12ef86b19910..4d84122bd643 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -36,6 +36,9 @@ void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); #endif +#else /* !CONFIG_IA32_EMULATION */ +#define entry_SYSCALL_compat 
NULL +#define entry_SYSENTER_compat NULL #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4fb36fba4b5a..f44e2f9ab65d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -126,12 +126,12 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); +bool do_syscall_64(struct pt_regs *regs, int nr); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); -long do_SYSENTER_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e5ffc8b0e46..0a3ea787ac51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -2074,24 +2075,24 @@ void syscall_init(void) wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -#ifdef CONFIG_IA32_EMULATION - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); - /* - * This only works on Intel CPUs. - * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. - * This does not cause SYSENTER to jump to the wrong location, because - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
- */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); -#else - wrmsrl_cstar((unsigned long)ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -#endif + if (ia32_enabled()) { + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + } else { + wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + } /* * Flags to clear on syscall; clear as much as possible diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b786d48f5a0f..8857abc706e4 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #define DPL0 0x0 @@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = { #endif SYSG(X86_TRAP_OF, asm_exc_overflow), +}; + +static const struct idt_data ia32_idt[] __initconst = { #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) @@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void) void __init idt_setup_traps(void) { idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); + + if (ia32_enabled()) + 
idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true); } #ifdef CONFIG_X86_64