Updates for x86 memory management:

- Make LAM enablement safe vs. kernel threads using a process mm
    temporarily as switching back to the process would not update CR3 and
    therefore not enable LAM causing faults in user space when using tagged
    pointers. Cure it by synchronizing LAM enablement via IPIs to all CPUs
    which use the related mm.
 
  - Cure a harmless LAM inconsistency between CR3 and the state during
    context switch. It's both confusing and prone to lead to real bugs
 
  - Fix alternate stack handling for threads which run with a non-zero
    protection key. The non-zero key prevents the kernel from accessing the
    alternate stack. Cure it by temporarily enabling all protection keys for
    the alternate stack setup/restore operations.
 
  - Provide an EFI config table identity mapping for the kexec kernel to
    prevent kexec failures caused by the new kernel being unable to access
    the config table array
 
  - Use GB pages only when a full GB is mapped in the identity map as
    otherwise the CPU can speculate into reserved areas after the end of
    memory which causes malfunction on UV systems.
 
  - Remove the noisy and pointless SRAT table dump during boot
 
  - Use is_ioremap_addr() for iounmap() address range checks instead of
    high_memory. is_ioremap_addr() is more precise.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmbpPpYTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoYddD/9HeH5/rpWS3JU4ZVC+huY28uJuwAFW
 ER48zniRbmuz8y+dZZ6K8uvqoWB+ro+yNjA9Jhm9nHUzhs7kE5O8+bmkUi6HXViW
 6zS6PW95+u80dmSGy1Gna0SU3158OyBf2X61SySJABLLek7WwrR7jakkgrDBVtL5
 ILKS/dUwIrUPoVlszCh9uE0Kj6gdFquooE06sif5EIibnhSgSXfr2EbGj0Qq/YYf
 FYfpggSSVpTXFSkZSB2VCEqK66jaGUfKzZ6v1DkSioChUCsky2OO6zD9pk0dMixO
 a/0XvRUo3OhiXZbj1tPUtxaEBgJdigpsxke7xQSVxSl+DNNuapiybpgAzFM5Xh+m
 yFcP66nIpJcHE10vjVR3jSUlTSb2zk+v9d1Ujj10G1h8RHLTfsTCRHgzs7P0/nkE
 NJleWstYVRV5rFpPLoY0ryQmjW/PzYokkaqWKI12Lhxg4ojijZso3pS8WfOsk1/B
 081tOZERWeGnJEOOJwwYE1wt0Qq8th4S9b2/fz3vk2fsEHIf42s4fKQwy1CxKopb
 PyIrgnZyWx6ueX9QaIGIzGV1GsY4FKMgFJVOyVb0D0stMnr1ty2m3993eNs/nCXy
 +rHPMwFteLcwiWp/C3hq5IQd7uEvmRt/mYJ5hdvCj5wCIkXI3JtgsXfLSVs3Ln4f
 R6HvZehYmbJoNQ==
 =VZcR
 -----END PGP SIGNATURE-----

Merge tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 memory management updates from Thomas Gleixner:

 - Make LAM enablement safe vs. kernel threads using a process mm
   temporarily as switching back to the process would not update CR3 and
   therefore not enable LAM causing faults in user space when using
   tagged pointers. Cure it by synchronizing LAM enablement via IPIs to
   all CPUs which use the related mm.

 - Cure a harmless LAM inconsistency between CR3 and the state during
   context switch. It's both confusing and prone to lead to real bugs

 - Fix alternate stack handling for threads which run with a non-zero
   protection key. The non-zero key prevents the kernel from accessing
   the alternate stack. Cure it by temporarily enabling all protection
   keys for the alternate stack setup/restore operations.

 - Provide an EFI config table identity mapping for the kexec kernel to
   prevent kexec failures caused by the new kernel being unable to
   access the config table array

 - Use GB pages only when a full GB is mapped in the identity map as
   otherwise the CPU can speculate into reserved areas after the end of
   memory which causes malfunction on UV systems.

 - Remove the noisy and pointless SRAT table dump during boot

 - Use is_ioremap_addr() for iounmap() address range checks instead of
   high_memory. is_ioremap_addr() is more precise.

* tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/ioremap: Improve iounmap() address range checks
  x86/mm: Remove duplicate check from build_cr3()
  x86/mm: Remove unused NX related declarations
  x86/mm: Remove unused CR3_HW_ASID_BITS
  x86/mm: Don't print out SRAT table information
  x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
  x86/kexec: Add EFI config table identity mapping for kexec kernel
  selftests/mm: Add new testcases for pkeys
  x86/pkeys: Restore altstack access in sigreturn()
  x86/pkeys: Update PKRU to enable all pkeys before XSAVE
  x86/pkeys: Add helper functions to update PKRU on the sigframe
  x86/pkeys: Add PKRU as a parameter in signal handling functions
  x86/mm: Cleanup prctl_enable_tagged_addr() nr_bits error checking
  x86/mm: Fix LAM inconsistency during context switch
  x86/mm: Use IPIs to synchronize LAM enablement
This commit is contained in:
Linus Torvalds 2024-09-17 15:03:01 +02:00
commit 70f43ea3a3
20 changed files with 664 additions and 60 deletions

View File

@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long fpu__get_fpstate_size(void);
extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru);
extern void fpu__clear_user_states(struct fpu *fpu);
extern bool fpu__restore_sig(void __user *buf, int ia32_frame);

View File

@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
#ifdef CONFIG_ADDRESS_MASKING
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
return mm->context.lam_cr3_mask;
/*
* When switch_mm_irqs_off() is called for a kthread, it may race with
* LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
* things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
* reads a single value for both.
*/
return READ_ONCE(mm->context.lam_cr3_mask);
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)

View File

@ -517,8 +517,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
extern void set_nx(void);
extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

View File

@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return lam << X86_CR3_LAM_U57_BIT;
}
static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
this_cpu_write(cpu_tlbstate.lam,
mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
this_cpu_write(tlbstate_untag_mask, untag_mask);
}
#else
@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return 0;
}
static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif

View File

@ -63,6 +63,16 @@ setfx:
return true;
}
/*
 * Overwrite the PKRU value inside an XSAVE image that has already been
 * pushed onto the user signal frame.
 *
 * Returns 0 without touching @buf when X86_FEATURE_OSPKE is not enabled,
 * otherwise the result of the __put_user() store.
 */
static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
{
	unsigned int __user *pkru_slot;

	/* Nothing to patch when protection keys are not enabled. */
	if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
		return 0;

	/* Locate the PKRU component inside the user-space XSAVE buffer. */
	pkru_slot = (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU);

	return __put_user(pkru, pkru_slot);
}
/*
* Signal frame handlers.
*/
@ -156,10 +166,17 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
return !err;
}
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
{
if (use_xsave())
return xsave_to_user_sigframe(buf);
int err = 0;
if (use_xsave()) {
err = xsave_to_user_sigframe(buf);
if (!err)
err = update_pkru_in_sigframe(buf, pkru);
return err;
}
if (use_fxsr())
return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
@ -185,7 +202,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
{
struct task_struct *tsk = current;
struct fpstate *fpstate = tsk->thread.fpu.fpstate;
@ -228,7 +245,7 @@ retry:
fpregs_restore_userregs();
pagefault_disable();
ret = copy_fpregs_to_sigframe(buf_fx);
ret = copy_fpregs_to_sigframe(buf_fx, pkru);
pagefault_enable();
fpregs_unlock();

View File

@ -999,6 +999,19 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
/*
 * Compute the user-space address of the state component for feature
 * @xfeature_nr inside the XSAVE buffer @xsave.
 *
 * The buffer is expected to use the standard (non-compacted) layout, as is
 * the case for user mode signal frames. Warns once and returns NULL when
 * the requested feature is not enabled.
 */
void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
{
	void __user *base = xsave;

	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	return base + xstate_offsets[xfeature_nr];
}
#ifdef CONFIG_ARCH_HAS_PKEYS
/*

View File

@ -54,6 +54,8 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
static inline u64 xfeatures_mask_supervisor(void)
{
return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;

View File

@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#ifdef CONFIG_ACPI
/*
@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
unsigned long mstart, mend;
void *kaddr;
int ret;
if (!efi_enabled(EFI_BOOT))
return 0;
@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
if (!mstart)
return 0;
ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
if (ret)
return ret;
kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
if (!kaddr) {
pr_err("Could not map UEFI system table\n");
return -ENOMEM;
}
mstart = efi_config_table;
if (efi_enabled(EFI_64BIT)) {
efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
} else {
efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
}
memunmap(kaddr);
return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
return 0;

View File

@ -798,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
#define LAM_U57_BITS 6
/*
 * Per-CPU callback: if this CPU currently has @__mm loaded, OR the mm's LAM
 * mask into CR3 and refresh the per-CPU LAM/untag state to match.
 */
static void enable_lam_func(void *__mm)
{
	struct mm_struct *mm = __mm;
	unsigned long lam;

	/* CPUs running a different mm have nothing to update. */
	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
		return;

	/* Read the mask once and use the same value for CR3 and tlbstate. */
	lam = mm_lam_cr3_mask(mm);
	write_cr3(__read_cr3() | lam);
	cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
}
/*
 * Enable LAM_U57 for @mm: store the CR3 LAM mask and the untag mask in the
 * mm context, run enable_lam_func() on the CPUs in mm_cpumask(mm), and then
 * set MM_CONTEXT_LOCK_LAM in the context flags.
 */
static void mm_enable_lam(struct mm_struct *mm)
{
/* Publish the LAM state in the mm before asking any CPU to pick it up. */
mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
/* The untag mask has bits 62..57 cleared and every other bit set. */
mm->context.untag_mask = ~GENMASK(62, 57);
/*
* Even though the process must still be single-threaded at this
* point, kernel threads may be using the mm. IPI those kernel
* threads if they exist.
*/
/* NOTE(review): the trailing 'true' is presumably the "wait" flag - confirm. */
on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
}
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@ -814,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
if (mmap_write_lock_killable(mm))
return -EINTR;
/*
* MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
* being enabled unless the process is single threaded:
*/
if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
mmap_write_unlock(mm);
return -EBUSY;
}
if (!nr_bits) {
mmap_write_unlock(mm);
return -EINVAL;
} else if (nr_bits <= LAM_U57_BITS) {
mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
mm->context.untag_mask = ~GENMASK(62, 57);
} else {
if (!nr_bits || nr_bits > LAM_U57_BITS) {
mmap_write_unlock(mm);
return -EINVAL;
}
write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
set_tlbstate_lam_mode(mm);
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
mm_enable_lam(mm);
mmap_write_unlock(mm);

View File

@ -60,6 +60,24 @@ static inline int is_x32_frame(struct ksignal *ksig)
ksig->ka.sa.sa_flags & SA_X32_ABI;
}
/*
 * Temporarily open up every protection key (PKRU = 0) so that both the
 * stack currently in use and the alternate signal stack can be written.
 * The application may have protected its alternate signal stack with any
 * available pkey and the kernel cannot tell which one, hence all of them
 * are enabled.
 *
 * PKRU is reset to init_pkru later in the flow, in
 * fpu__clear_user_states(). The application is responsible for enabling
 * the pkey it needs as the first step of its signal handler so that the
 * handler does not segfault.
 *
 * Returns the PKRU value that was in effect before it was cleared.
 */
static inline u32 sig_prepare_pkru(void)
{
	u32 saved_pkru;

	saved_pkru = read_pkru();
	write_pkru(0);

	return saved_pkru;
}
/*
* Set up a signal frame.
*/
@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
u32 pkru;
/* redzone */
if (!ia32_frame)
@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
return (void __user *)-1L;
}
/* Update PKRU to enable access to the alternate signal stack. */
pkru = sig_prepare_pkru();
/* save i387 and extended state */
if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
/*
* Restore PKRU to the original, user-defined value; disable
* extra pkeys enabled for the alternate signal stack, if any.
*/
write_pkru(pkru);
return (void __user *)-1L;
}
return (void __user *)sp;
}

View File

@ -260,15 +260,15 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (restore_signal_shadow_stack())
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
return regs->ax;
badframe:

View File

@ -99,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
bool use_gbpage;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
if (info->direct_gbpages) {
/* if this is already a gbpage, this portion is already mapped */
if (pud_leaf(*pud))
continue;
/* Is using a gbpage allowed? */
use_gbpage = info->direct_gbpages;
/* Don't use gbpage if it maps more than the requested region. */
/* at the beginning: */
use_gbpage &= ((addr & ~PUD_MASK) == 0);
/* ... or at the end: */
use_gbpage &= ((next & ~PUD_MASK) == 0);
/* Never overwrite existing mappings */
use_gbpage &= !pud_present(*pud);
if (use_gbpage) {
pud_t pudval;
if (pud_present(*pud))
continue;
addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;

View File

@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/ioremap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
if ((void __force *)addr <= high_memory)
if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr)))
return;
/*

View File

@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
int __init x86_acpi_numa_init(void)

View File

@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@ -85,9 +86,6 @@
*
*/
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
@ -160,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
unsigned long cr3 = __sme_pa(pgd) | lam;
if (static_cpu_has(X86_FEATURE_PCID)) {
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
@ -503,9 +500,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
{
struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
@ -619,9 +616,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
/*
* Start remote flushes and then read tlb_gen.
*/
/* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm)
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
@ -633,7 +628,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
barrier();
}
set_tlbstate_lam_mode(next);
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@ -652,6 +647,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
if (next != prev) {
cr4_update_pce_mm(next);
@ -698,6 +694,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@ -705,7 +702,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
WARN_ON(mm_lam_cr3_mask(mm));
WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@ -724,7 +721,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
set_tlbstate_lam_mode(mm);
cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);

View File

@ -4,6 +4,7 @@
#include <linux/kasan.h>
#include <asm/pgtable.h>
#include <asm/vmalloc.h>
#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP)
/*

View File

@ -90,6 +90,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr
CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
VMTARGETS := protection_keys
VMTARGETS += pkey_sighandler_tests
BINARIES_32 := $(VMTARGETS:%=%_32)
BINARIES_64 := $(VMTARGETS:%=%_64)

View File

@ -79,7 +79,18 @@ extern void abort_hooks(void);
} \
} while (0)
__attribute__((noinline)) int read_ptr(int *ptr);
/* Compiler-level barrier: an empty asm statement with a "memory" clobber. */
#define barrier() __asm__ __volatile__("": : :"memory")
#ifndef noinline
# define noinline __attribute__((noinline))
#endif
/*
 * Load and return the int stored at @ptr. The function is never inlined
 * and the load is preceded by a barrier so that the compiler does not
 * optimize the access away.
 */
noinline int read_ptr(int *ptr)
{
	int value;

	barrier();
	value = *ptr;

	return value;
}
void expected_pkey_fault(int pkey);
int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
int sys_pkey_free(unsigned long pkey);

View File

@ -0,0 +1,481 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
*
* The testcases in this file exercise various flows related to signal handling,
* using an alternate signal stack, with the default pkey (pkey 0) disabled.
*
* Compile with:
* gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
* gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
*/
#define _GNU_SOURCE
#define __SANE_USERSPACE_TYPES__
#include <errno.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <signal.h>
#include <assert.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pthread.h>
#include <limits.h>
#include "pkey-helpers.h"
#define STACK_SIZE PTHREAD_STACK_MIN
/*
 * Empty definition of expected_pkey_fault(): the body does nothing.
 * NOTE(review): presumably present only to satisfy the declaration in
 * pkey-helpers.h - confirm.
 */
void expected_pkey_fault(int pkey) {}
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
siginfo_t siginfo = {0};
/*
* Issue a system call directly with inline assembly and return the raw
* value the kernel leaves in the return register. No errno handling is
* done here; callers in this file treat a negative return as -errno.
*
* We need to use inline assembly instead of glibc's syscall because glibc's
* syscall will attempt to access the PLT in order to call a library function
* which is protected by MPK 0 which we don't have access to.
*
* @n is the syscall number, @a1..@a6 are its arguments. On i386 only
* @a1..@a5 are handed to the kernel; @a6 is ignored.
*/
static inline __always_inline
long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
{
unsigned long ret;
#ifdef __x86_64__
/* Arguments 4-6 must live in r10, r8 and r9; pin them explicitly. */
register long r10 asm("r10") = a4;
register long r8 asm("r8") = a5;
register long r9 asm("r9") = a6;
/* rcx and r11 are listed as clobbered by the syscall instruction. */
asm volatile ("syscall"
: "=a"(ret)
: "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9)
: "rcx", "r11", "memory");
#elif defined __i386__
/* int $0x80 path: arguments are passed via the b, c, d, S and D constraints. */
asm volatile ("int $0x80"
: "=a"(ret)
: "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
: "memory");
#else
# error syscall_raw() not implemented
#endif
return ret;
}
/*
 * SIGSEGV handler: copy the delivered siginfo into the global 'siginfo'
 * under 'mutex', signal 'cond' for the waiting test, and then leave via a
 * raw SYS_exit instead of returning from the handler.
 */
static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
{
	(void)signo;
	(void)ucontext;

	pthread_mutex_lock(&mutex);
	memcpy(&siginfo, info, sizeof(siginfo));
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&mutex);

	/* Do not return to the faulting context. */
	syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
}
/*
 * SIGUSR1 handler: copy the delivered siginfo into the global 'siginfo'
 * under 'mutex' and signal 'cond', then return normally.
 */
static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext)
{
	(void)signo;
	(void)ucontext;

	pthread_mutex_lock(&mutex);
	memcpy(&siginfo, info, sizeof(siginfo));
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&mutex);
}
/*
 * SIGUSR2 handler: only prints its own name.
 *
 * When the handler runs, pkru should hold the init_pkru value, which
 * enables MPK 0, so calling library functions such as printf() is fine.
 */
static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext)
{
	(void)signo;
	(void)info;
	(void)ucontext;

	printf("%s invoked.\n", __func__);
}
/*
 * Send SIGUSR2 to the calling thread using raw syscalls only
 * (gettid followed by tkill).
 */
static void raise_sigusr2(void)
{
	pid_t self = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0);

	syscall_raw(SYS_tkill, self, SIGUSR2, 0, 0, 0, 0);

	/*
	 * Once the signal handler has returned, execution should resume
	 * here in the interrupted thread.
	 */
}
/*
 * Thread body: write 0x55555555 to the pkey register and then store to
 * address 0x1 to provoke a SIGSEGV. @ptr is unused. The trailing return is
 * not expected to be reached.
 */
static void *thread_segv_with_pkey0_disabled(void *ptr)
{
/* Disable MPK 0 (and all others too) */
__write_pkey_reg(0x55555555);
/* Segfault (with SEGV_MAPERR) */
*(int *) (0x1) = 1;
return NULL;
}
/*
 * Thread body: write 0x55555555 to the pkey register and return. @ptr is
 * unused. Per the comment below, the return itself is the faulting
 * operation because the thread's stack is no longer accessible.
 */
static void *thread_segv_pkuerr_stack(void *ptr)
{
/* Disable MPK 0 (and all others too) */
__write_pkey_reg(0x55555555);
/* After we disable MPK 0, we can't access the stack to return */
return NULL;
}
/*
 * Thread body: install @ptr (a stack_t) as the alternate signal stack via a
 * raw sigaltstack syscall, write 0x55555551 to the pkey register, and then
 * store through the invalid pointer (int *)1 to provoke a SIGSEGV. The raw
 * SYS_exit and the return after the store are fallbacks in case the store
 * does not terminate the thread.
 */
static void *thread_segv_maperr_ptr(void *ptr)
{
stack_t *stack = ptr;
int *bad = (int *)1;
/*
* Setup alternate signal stack, which should be pkey_mprotect()ed by
* MPK 0. The thread's stack cannot be used for signals because it is
* not accessible by the default init_pkru value of 0x55555554.
*/
syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
/* Disable MPK 0. Only MPK 1 is enabled. */
__write_pkey_reg(0x55555551);
/* Segfault */
*bad = 1;
syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
return NULL;
}
/*
 * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
 * Note that the new thread stack and the alternate signal stack is
 * protected by MPK 0.
 *
 * Installs sigsegv_handler for SIGSEGV, starts a detached thread running
 * thread_segv_with_pkey0_disabled(), waits until the handler has published
 * a siginfo, and reports success when it describes a SEGV_MAPERR at
 * address 1. Exits the program if the handler cannot be installed or the
 * thread cannot be created.
 */
static void test_sigsegv_handler_with_pkey0_disabled(void)
{
	struct sigaction sa;
	pthread_attr_t attr;
	pthread_t thr;
	int err;

	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = sigsegv_handler;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}

	memset(&siginfo, 0, sizeof(siginfo));

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	err = pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL);
	pthread_attr_destroy(&attr);
	if (err) {
		/*
		 * Without the thread nobody will ever signal 'cond', so
		 * waiting below would block forever. pthread_create()
		 * returns the error number instead of setting errno.
		 */
		errno = err;
		perror("pthread_create");
		exit(EXIT_FAILURE);
	}

	pthread_mutex_lock(&mutex);
	while (siginfo.si_signo == 0)
		pthread_cond_wait(&cond, &mutex);
	pthread_mutex_unlock(&mutex);

	ksft_test_result(siginfo.si_signo == SIGSEGV &&
			 siginfo.si_code == SEGV_MAPERR &&
			 siginfo.si_addr == (void *)1,
			 "%s\n", __func__);
}
/*
 * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
 * Note that the new thread stack and the alternate signal stack is
 * protected by MPK 0, which renders them inaccessible when MPK 0
 * is disabled. So just the return from the thread should cause a
 * segfault with SEGV_PKUERR.
 *
 * Exits the program if the handler cannot be installed or the thread
 * cannot be created.
 */
static void test_sigsegv_handler_cannot_access_stack(void)
{
	struct sigaction sa;
	pthread_attr_t attr;
	pthread_t thr;
	int err;

	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = sigsegv_handler;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}

	memset(&siginfo, 0, sizeof(siginfo));

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	err = pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL);
	pthread_attr_destroy(&attr);
	if (err) {
		/*
		 * Without the thread nobody will ever signal 'cond', so
		 * waiting below would block forever. pthread_create()
		 * returns the error number instead of setting errno.
		 */
		errno = err;
		perror("pthread_create");
		exit(EXIT_FAILURE);
	}

	pthread_mutex_lock(&mutex);
	while (siginfo.si_signo == 0)
		pthread_cond_wait(&cond, &mutex);
	pthread_mutex_unlock(&mutex);

	ksft_test_result(siginfo.si_signo == SIGSEGV &&
			 siginfo.si_code == SEGV_PKUERR,
			 "%s\n", __func__);
}
/*
 * Verify that the sigsegv handler that uses an alternate signal stack
 * is correctly invoked for a thread which uses a non-zero MPK to protect
 * its own stack, and disables all other MPKs (including 0).
 *
 * Exits the program if the handler cannot be installed or the thread
 * cannot be cloned; aborts if either stack mapping fails.
 */
static void test_sigsegv_handler_with_different_pkey_for_stack(void)
{
	struct sigaction sa;
	static stack_t sigstack;
	void *stack;
	int pkey;
	int parent_pid = 0;
	int child_pid = 0;

	sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
	sa.sa_sigaction = sigsegv_handler;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}

	stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	assert(stack != MAP_FAILED);

	/* Allow access to MPK 0 and MPK 1 */
	__write_pkey_reg(0x55555550);

	/* Protect the new stack with MPK 1 */
	pkey = pkey_alloc(0, 0);
	pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);

	/* Set up alternate signal stack that will use the default MPK */
	sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Same check as for the thread stack above. */
	assert(sigstack.ss_sp != MAP_FAILED);
	sigstack.ss_flags = 0;
	sigstack.ss_size = STACK_SIZE;

	memset(&siginfo, 0, sizeof(siginfo));

	/* Use clone to avoid newer glibcs using rseq on new threads */
	long ret = syscall_raw(SYS_clone,
			       CLONE_VM | CLONE_FS | CLONE_FILES |
			       CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
			       CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
			       CLONE_DETACHED,
			       (long) ((char *)(stack) + STACK_SIZE),
			       (long) &parent_pid,
			       (long) &child_pid, 0, 0);

	if (ret < 0) {
		/*
		 * No child means nobody will ever signal 'cond'; bail out
		 * instead of blocking forever in the wait loop below.
		 */
		errno = -ret;
		perror("clone");
		exit(EXIT_FAILURE);
	} else if (ret == 0) {
		thread_segv_maperr_ptr(&sigstack);
		syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
	}

	pthread_mutex_lock(&mutex);
	while (siginfo.si_signo == 0)
		pthread_cond_wait(&cond, &mutex);
	pthread_mutex_unlock(&mutex);

	ksft_test_result(siginfo.si_signo == SIGSEGV &&
			 siginfo.si_code == SEGV_MAPERR &&
			 siginfo.si_addr == (void *)1,
			 "%s\n", __func__);
}
/*
* Verify that the PKRU value set by the application is correctly
* restored upon return from signal handling.
*/
static void test_pkru_preserved_after_sigusr1(void)
{
struct sigaction sa;
unsigned long pkru = 0x45454544;
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = sigusr1_handler;
sigemptyset(&sa.sa_mask);
if (sigaction(SIGUSR1, &sa, NULL) == -1) {
perror("sigaction");
exit(EXIT_FAILURE);
}
memset(&siginfo, 0, sizeof(siginfo));
__write_pkey_reg(pkru);
raise(SIGUSR1);
pthread_mutex_lock(&mutex);
while (siginfo.si_signo == 0)
pthread_cond_wait(&cond, &mutex);
pthread_mutex_unlock(&mutex);
/* Ensure the pkru value is the same after returning from signal. */
ksft_test_result(pkru == __read_pkey_reg() &&
siginfo.si_signo == SIGUSR1,
"%s\n", __func__);
}
/*
 * Thread body for test_pkru_sigreturn(): install @ptr (a stack_t) as the
 * alternate signal stack, write 0x55555545 to the pkey register, send
 * SIGUSR2 to itself, and after the handler has returned write a short
 * message to fd 1 and exit. All kernel interaction goes through
 * syscall_raw() so that no library code is entered.
 */
static noinline void *thread_sigusr2_self(void *ptr)
{
	/*
	 * A const char array like "Resuming after SIGUSR2" won't be stored on
	 * the stack and the code could access it via an offset from the program
	 * counter. This makes sure it's on the function's stack frame.
	 */
	char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ',
		'a', 'f', 't', 'e', 'r', ' ',
		'S', 'I', 'G', 'U', 'S', 'R', '2',
		'.', '.', '.', '\n', '\0'};
	stack_t *stack = ptr;

	/*
	 * Setup alternate signal stack, which should be pkey_mprotect()ed by
	 * MPK 0. The thread's stack cannot be used for signals because it is
	 * not accessible by the default init_pkru value of 0x55555554.
	 *
	 * Use the raw syscall helper rather than glibc's syscall(), matching
	 * thread_segv_maperr_ptr() and every other syscall in this thread.
	 */
	syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);

	/* Disable MPK 0. Only MPK 2 is enabled. */
	__write_pkey_reg(0x55555545);

	raise_sigusr2();

	/* Do something, to show the thread resumed execution after the signal */
	syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0);

	/*
	 * We can't return to test_pkru_sigreturn because it
	 * will attempt to use a %rbp value which is on the stack
	 * of the main thread.
	 */
	syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
	return NULL;
}
/*
 * Verify that sigreturn is able to restore altstack even if the thread had
 * disabled pkey 0.
 *
 * Resets SIGSEGV to its default action, installs sigusr2_handler on the
 * alternate stack, clones a thread running thread_sigusr2_self() on a
 * pkey-protected stack, and waits until that thread is gone. Exits the
 * program if a handler cannot be installed or the thread cannot be cloned;
 * aborts if either stack mapping fails.
 */
static void test_pkru_sigreturn(void)
{
	struct sigaction sa = {0};
	static stack_t sigstack;
	void *stack;
	int pkey;
	int parent_pid = 0;
	int child_pid = 0;

	sa.sa_handler = SIG_DFL;
	sa.sa_flags = 0;
	sigemptyset(&sa.sa_mask);

	/*
	 * For this testcase, we do not want to handle SIGSEGV. Reset handler
	 * to default so that the application can crash if it receives SIGSEGV.
	 */
	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}

	sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
	sa.sa_sigaction = sigusr2_handler;
	sigemptyset(&sa.sa_mask);

	if (sigaction(SIGUSR2, &sa, NULL) == -1) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}

	stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	assert(stack != MAP_FAILED);

	/*
	 * Allow access to MPK 0 and MPK 2. The child thread (to be created
	 * later in this flow) will have its stack protected by MPK 2, whereas
	 * the current thread's stack is protected by the default MPK 0. Hence
	 * both need to be enabled.
	 */
	__write_pkey_reg(0x55555544);

	/* Protect the stack with MPK 2 */
	pkey = pkey_alloc(0, 0);
	pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);

	/* Set up alternate signal stack that will use the default MPK */
	sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Same check as for the thread stack above. */
	assert(sigstack.ss_sp != MAP_FAILED);
	sigstack.ss_flags = 0;
	sigstack.ss_size = STACK_SIZE;

	/* Use clone to avoid newer glibcs using rseq on new threads */
	long ret = syscall_raw(SYS_clone,
			       CLONE_VM | CLONE_FS | CLONE_FILES |
			       CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
			       CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
			       CLONE_DETACHED,
			       (long) ((char *)(stack) + STACK_SIZE),
			       (long) &parent_pid,
			       (long) &child_pid, 0, 0);

	if (ret < 0) {
		/*
		 * Do not fall through: the negative return would be used as
		 * a tid below, tkill would fail with -EINVAL and the test
		 * would be reported as passed although nothing ran.
		 */
		errno = -ret;
		perror("clone");
		exit(EXIT_FAILURE);
	} else if (ret == 0) {
		thread_sigusr2_self(&sigstack);
		syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
	}

	child_pid = ret;
	/* Check that thread exited */
	do {
		sched_yield();
		ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0);
	} while (ret != -ESRCH && ret != -EINVAL);

	ksft_test_result_pass("%s\n", __func__);
}
/*
 * Table of test functions; main() sizes the kselftest plan from it and
 * calls the entries in the order listed here.
 */
static void (*pkey_tests[])(void) = {
test_sigsegv_handler_with_pkey0_disabled,
test_sigsegv_handler_cannot_access_stack,
test_sigsegv_handler_with_different_pkey_for_stack,
test_pkru_preserved_after_sigusr1,
test_pkru_sigreturn
};
/*
 * Entry point: print the kselftest header, announce one planned result per
 * entry of pkey_tests[], run every entry in table order and finish the
 * kselftest run. Command line arguments are not used.
 */
int main(int argc, char *argv[])
{
	int idx = 0;

	(void)argc;
	(void)argv;

	ksft_print_header();
	ksft_set_plan(ARRAY_SIZE(pkey_tests));

	while (idx < ARRAY_SIZE(pkey_tests)) {
		pkey_tests[idx]();
		idx++;
	}

	ksft_finished();
	return 0;
}

View File

@ -954,16 +954,6 @@ void close_test_fds(void)
nr_test_fds = 0;
}
#define barrier() __asm__ __volatile__("": : :"memory")
__attribute__((noinline)) int read_ptr(int *ptr)
{
/*
* Keep GCC from optimizing this away somehow
*/
barrier();
return *ptr;
}
void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
{
int i, err;