Changes to the x86 entry code in v6.7:

- Make IA32_EMULATION boot time configurable with
    the new ia32_emulation=<bool> boot option.
 
  - Clean up fast syscall return validation code: convert
    it to C and refactor the code.
 
  - As part of this, optimize the canonical RIP test code.
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmU9DiARHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1iNAw//cLn9gBXMVPDiCDVUOTqjkZ+OwIF11Y9v
 WatksSe5hrw0Bzl5CiSvtrWpTkKPnhyM8Lc1WD8l0YSMKprdkQfNAvQOPv0IMLjk
 XP1pgQhAiXwB87XL/G2sA6RunuK56zlnl7KJiDrQThrS/WOfrq3UkB2vyYEP4GtP
 69WZ/WM++u74uEml0+HZ0Z9HVvzwYl1VQPdTYfl52S4H3U8MXL89YEsPr13Ttq88
 FMKdXJ/VvItuVM/ZHHqFkGvRJjUtDWePLu29b684Ap6onDJ7uMMw86Gj5UxXtdpB
 Axsjuwlca8sCPotcqohay6IdyxIth6lMdvjPv0KhA+/QMrHbDaluv88YQs4k7Add
 1GPULH6oeDTHxMPOcJmFuSTpMY8HP6O9ZIXB6ogQRkLaDJKaWr5UQU7L2VBQ/WUy
 NRa6mba0XHYrz6U7DmtsdL0idWBJeJokHmaIcGJ/pp6gMznvufm2+SoJ6w6wcYva
 VTSTyrAAj/N9/TzJ5i8S2+yDPI9GanFpZJfYbW/rT9XGutvXWVKe3AmUNgR8O+hE
 JiEMfpR0TtXXlrik74jur/RPZhaFIE8MeCvJrkJ3oxQlPThYSTMBAlUOtD7kOfNT
 onjPrumREX4hOIBU+nnC9VrJMqxX9lz4xDzqw3jvX99Ma0o8Wx/UndWELX8tAYwd
 j8M8NWAbv90=
 =YkaP
 -----END PGP SIGNATURE-----

Merge tag 'x86-entry-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 entry updates from Ingo Molnar:

 - Make IA32_EMULATION boot time configurable with
   the new ia32_emulation=<bool> boot option

 - Clean up fast syscall return validation code: convert
   it to C and refactor the code

 - As part of this, optimize the canonical RIP test code

* tag 'x86-entry-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/32: Clean up syscall fast exit tests
  x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
  x86/entry/64: Convert SYSRET validation tests to C
  x86/entry/32: Remove SEP test for SYSEXIT
  x86/entry/32: Convert do_fast_syscall_32() to bool return type
  x86/entry/compat: Combine return value test from syscall handler
  x86/entry/64: Remove obsolete comment on tracing vs. SYSRET
  x86: Make IA32_EMULATION boot time configurable
  x86/entry: Make IA32 syscalls' availability depend on ia32_enabled()
  x86/elf: Make loading of 32bit processes depend on ia32_enabled()
  x86/entry: Compile entry_SYSCALL32_ignore() unconditionally
  x86/entry: Rename ignore_sysret()
  x86: Introduce ia32_enabled()
This commit is contained in:
Linus Torvalds 2023-10-30 15:27:27 -10:00
commit ed766c2611
13 changed files with 155 additions and 132 deletions

View File

@ -1893,6 +1893,12 @@
0 -- machine default
1 -- force brightness inversion
ia32_emulation= [X86-64]
Format: <bool>
When true, allows loading 32-bit programs and executing 32-bit
syscalls, essentially overriding IA32_EMULATION_DEFAULT_DISABLED at
boot time. When false, unconditionally disables IA32 emulation.
icn= [HW,ISDN]
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]

View File

@ -2955,6 +2955,15 @@ config IA32_EMULATION
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
config IA32_EMULATION_DEFAULT_DISABLED
bool "IA32 emulation disabled by default"
default n
depends on IA32_EMULATION
help
Make IA32 emulation disabled by default. This prevents loading 32-bit
processes and access to 32-bit syscalls. If unsure, leave it to its
default value.
config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64

View File

@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/init.h>
#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
instrumentation_end();
syscall_exit_to_user_mode(regs);
/*
* Check that the register state is valid for using SYSRET to exit
* to userspace. Otherwise use the slower but fully capable IRET
* exit path.
*/
/* XEN PV guests always use the IRET path */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* SYSRET requires RCX == RIP and R11 == EFLAGS */
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
return false;
/* CS and SS must match the values set in MSR_STAR */
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
return false;
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
* TASK_SIZE_MAX covers all user-accessible addresses other than
* the deprecated vsyscall page.
*/
if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;
/*
* SYSRET cannot restore RF. It can restore TF, but unlike IRET,
* restoring TF results in a trap from userspace immediately after
* SYSRET.
*/
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
return false;
/* Use SYSRET to exit to userspace */
return true;
}
#endif
@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
return (int)regs->orig_ax;
}
#ifdef CONFIG_IA32_EMULATION
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
static int ia32_emulation_override_cmdline(char *arg)
{
return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
/*
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
*/
@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
/* Invoke the syscall. If it failed, keep it simple: use IRET. */
if (!__do_fast_syscall_32(regs))
return 0;
return false;
#ifdef CONFIG_X86_64
/*
* Opportunistic SYSRETL: if possible, try to return using SYSRETL.
* SYSRETL is available on all 64-bit CPUs, so we don't need to
* bother with SYSEXIT.
*
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
* because the ECX fixup above will ensure that this is essentially
* never the case.
* Check that the register state is valid for using SYSRETL/SYSEXIT
* to exit to userspace. Otherwise use the slower but fully capable
* IRET exit path.
*/
return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
/*
* Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
*
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
* because the ECX fixup above will ensure that this is essentially
* never the case.
*
* We don't allow syscalls at all from VM86 mode, but we still
* need to check VM, because we might be returning from sys_vm86.
*/
return static_cpu_has(X86_FEATURE_SEP) &&
regs->cs == __USER_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
/* XEN PV guests always use the IRET path */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* EIP must point to the VDSO landing pad */
if (unlikely(regs->ip != landing_pad))
return false;
/* CS and SS must match the values set in MSR_STAR */
if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
return false;
/* If the TF, RF, or VM flags are set, use IRET */
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
return false;
/* Use SYSRETL/SYSEXIT to exit to userspace */
return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
regs->sp = regs->bp;

View File

@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl %esp, %eax
call do_SYSENTER_32
testl %eax, %eax
testb %al, %al
jz .Lsyscall_32_done
STACKLEAK_ERASE

View File

@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
X86_FEATURE_XENPV
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
jne swapgs_restore_regs_and_return_to_usermode
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
#ifdef CONFIG_X86_5LEVEL
ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
jne swapgs_restore_regs_and_return_to_usermode
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
jne swapgs_restore_regs_and_return_to_usermode
movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
jne swapgs_restore_regs_and_return_to_usermode
/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
* restore RF properly. If the slowpath sets it for whatever reason, we
* need to restore it correctly.
*
* SYSRET can restore TF, but unlike IRET, restoring TF results in a
* trap from userspace immediately after SYSRET. This would cause an
* infinite loop whenever #DB happens with register state that satisfies
* the opportunistic SYSRET conditions. For example, single-stepping
* this user code:
*
* movq $stuck_here, %rcx
* pushfq
* popq %r11
* stuck_here:
*
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
jnz swapgs_restore_regs_and_return_to_usermode
/* nothing to check for RSP */
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
jne swapgs_restore_regs_and_return_to_usermode
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
@ -1509,18 +1447,16 @@ nmi_restore:
iretq
SYM_CODE_END(asm_exc_nmi)
#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
* MSRs to fully disable 32-bit SYSCALL.
*/
SYM_CODE_START(ignore_sysret)
SYM_CODE_START(entry_SYSCALL32_ignore)
UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
sysretl
SYM_CODE_END(ignore_sysret)
#endif
SYM_CODE_END(entry_SYSCALL32_ignore)
.pushsection .text, "ax"
__FUNC_ALIGN

View File

@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
movq %rsp, %rdi
call do_SYSENTER_32
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
movq %rsp, %rdi
call do_fast_syscall_32
sysret32_from_system_call:
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
/*
* Opportunistic SYSRET
*
* We are not going to return to userspace from the trampoline
* stack. So let's erase the thread stack right now.
*/

View File

@ -7,6 +7,7 @@
*/
#include <linux/thread_info.h>
#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
@ -149,7 +150,7 @@ do { \
((x)->e_machine == EM_X86_64)
#define compat_elf_check_arch(x) \
(elf_check_arch_ia32(x) || \
((elf_check_arch_ia32(x) && ia32_enabled()) || \
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
static inline void elf_common_init(struct thread_struct *t,

View File

@ -68,6 +68,20 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
#endif
#endif /* CONFIG_IA32_EMULATION */
extern bool __ia32_enabled;
static inline bool ia32_enabled(void)
{
return __ia32_enabled;
}
#else /* !CONFIG_IA32_EMULATION */
static inline bool ia32_enabled(void)
{
return IS_ENABLED(CONFIG_X86_32);
}
#endif
#endif /* _ASM_X86_IA32_H */

View File

@ -399,7 +399,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
extern asmlinkage void ignore_sysret(void);
extern asmlinkage void entry_SYSCALL32_ignore(void);
/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);

View File

@ -36,6 +36,9 @@ void entry_INT80_compat(void);
#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);
#endif
#else /* !CONFIG_IA32_EMULATION */
#define entry_SYSCALL_compat NULL
#define entry_SYSENTER_compat NULL
#endif
void x86_configure_nx(void);

View File

@ -126,12 +126,12 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
void do_syscall_64(struct pt_regs *regs, int nr);
bool do_syscall_64(struct pt_regs *regs, int nr);
#endif /* CONFIG_X86_32 */
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);
long do_SYSENTER_32(struct pt_regs *regs);
bool do_fast_syscall_32(struct pt_regs *regs);
bool do_SYSENTER_32(struct pt_regs *regs);
#endif /* _ASM_X86_SYSCALL_H */

View File

@ -62,6 +62,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
#include <asm/ia32.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
@ -2074,24 +2075,24 @@ void syscall_init(void)
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl_cstar((unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
* Flags to clear on syscall; clear as much as possible

View File

@ -10,6 +10,7 @@
#include <asm/proto.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
#include <asm/ia32.h>
#include <asm/idtentry.h>
#define DPL0 0x0
@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = {
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
};
static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void)
void __init idt_setup_traps(void)
{
idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
if (ia32_enabled())
idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
}
#ifdef CONFIG_X86_64