forked from Minki/linux
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar: "A landry list of fixes: - fix reboot breakage on some PCID-enabled system - fix crashes/hangs on some PCID-enabled systems - fix microcode loading on certain older CPUs - various unwinder fixes - extend an APIC quirk to more hardware systems and disable APIC related warning on virtualized systems - various Hyper-V fixes - a macro definition robustness fix - remove jprobes IRQ disabling - various mem-encryption fixes" * 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/microcode: Do the family check first x86/mm: Flush more aggressively in lazy TLB mode x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs x86/unwind: Disable unwinder warnings on 32-bit x86/unwind: Align stack pointer in unwinder dump x86/unwind: Use MSB for frame pointer encoding on 32-bit x86/unwind: Fix dereference of untrusted pointer x86/alternatives: Fix alt_max_short macro to really be a max() x86/mm/64: Fix reboot interaction with CR4.PCIDE kprobes/x86: Remove IRQ disabling from jprobe handlers kprobes/x86: Set up frame pointer in kprobe trampoline
This commit is contained in:
commit
e7a36a6ec9
@ -176,7 +176,7 @@
|
||||
/*
|
||||
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
|
||||
* frame pointer is replaced with an encoded pointer to pt_regs. The encoding
|
||||
* is just setting the LSB, which makes it an invalid stack address and is also
|
||||
* is just clearing the MSB, which makes it an invalid stack address and is also
|
||||
* a signal to the unwinder that it's a pt_regs pointer in disguise.
|
||||
*
|
||||
* NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
|
||||
@ -185,7 +185,7 @@
|
||||
.macro ENCODE_FRAME_POINTER
|
||||
#ifdef CONFIG_FRAME_POINTER
|
||||
mov %esp, %ebp
|
||||
orl $0x1, %ebp
|
||||
andl $0x7fffffff, %ebp
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
|
||||
u32 *hv_vp_index;
|
||||
EXPORT_SYMBOL_GPL(hv_vp_index);
|
||||
|
||||
u32 hv_max_vp_index;
|
||||
|
||||
static int hv_cpu_init(unsigned int cpu)
|
||||
{
|
||||
u64 msr_vp_index;
|
||||
@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
|
||||
|
||||
hv_vp_index[smp_processor_id()] = msr_vp_index;
|
||||
|
||||
if (msr_vp_index > hv_max_vp_index)
|
||||
hv_max_vp_index = msr_vp_index;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
|
||||
/* Each gva in gva_list encodes up to 4096 pages to flush */
|
||||
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
|
||||
|
||||
static struct hv_flush_pcpu __percpu *pcpu_flush;
|
||||
static struct hv_flush_pcpu __percpu **pcpu_flush;
|
||||
|
||||
static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
|
||||
static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
|
||||
|
||||
/*
|
||||
* Fills in gva_list starting from offset. Returns the number of items added.
|
||||
@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
|
||||
{
|
||||
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
|
||||
|
||||
/* valid_bank_mask can represent up to 64 banks */
|
||||
if (hv_max_vp_index / 64 >= 64)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
|
||||
* structs are not cleared between calls, we risk flushing unneeded
|
||||
* vCPUs otherwise.
|
||||
*/
|
||||
for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
|
||||
flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
|
||||
|
||||
/*
|
||||
* Some banks may end up being empty but this is acceptable.
|
||||
*/
|
||||
@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
|
||||
vcpu = hv_cpu_number_to_vp_number(cpu);
|
||||
vcpu_bank = vcpu / 64;
|
||||
vcpu_offset = vcpu % 64;
|
||||
|
||||
/* valid_bank_mask can represent up to 64 banks */
|
||||
if (vcpu_bank >= 64)
|
||||
return 0;
|
||||
|
||||
__set_bit(vcpu_offset, (unsigned long *)
|
||||
&flush->hv_vp_set.bank_contents[vcpu_bank]);
|
||||
if (vcpu_bank >= nr_bank)
|
||||
@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
int cpu, vcpu, gva_n, max_gvas;
|
||||
struct hv_flush_pcpu **flush_pcpu;
|
||||
struct hv_flush_pcpu *flush;
|
||||
u64 status = U64_MAX;
|
||||
unsigned long flags;
|
||||
@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
flush = this_cpu_ptr(pcpu_flush);
|
||||
flush_pcpu = this_cpu_ptr(pcpu_flush);
|
||||
|
||||
if (unlikely(!*flush_pcpu))
|
||||
*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
|
||||
|
||||
flush = *flush_pcpu;
|
||||
|
||||
if (unlikely(!flush)) {
|
||||
local_irq_restore(flags);
|
||||
goto do_native;
|
||||
}
|
||||
|
||||
if (info->mm) {
|
||||
flush->address_space = virt_to_phys(info->mm->pgd);
|
||||
@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
int nr_bank = 0, max_gvas, gva_n;
|
||||
struct hv_flush_pcpu_ex **flush_pcpu;
|
||||
struct hv_flush_pcpu_ex *flush;
|
||||
u64 status = U64_MAX;
|
||||
unsigned long flags;
|
||||
@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
flush = this_cpu_ptr(pcpu_flush_ex);
|
||||
flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
|
||||
|
||||
if (unlikely(!*flush_pcpu))
|
||||
*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
|
||||
|
||||
flush = *flush_pcpu;
|
||||
|
||||
if (unlikely(!flush)) {
|
||||
local_irq_restore(flags);
|
||||
goto do_native;
|
||||
}
|
||||
|
||||
if (info->mm) {
|
||||
flush->address_space = virt_to_phys(info->mm->pgd);
|
||||
@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
|
||||
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
|
||||
status = hv_do_rep_hypercall(
|
||||
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
|
||||
0, nr_bank + 2, flush, NULL);
|
||||
0, nr_bank, flush, NULL);
|
||||
} else if (info->end &&
|
||||
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
|
||||
status = hv_do_rep_hypercall(
|
||||
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
|
||||
0, nr_bank + 2, flush, NULL);
|
||||
0, nr_bank, flush, NULL);
|
||||
} else {
|
||||
gva_n = fill_gva_list(flush->gva_list, nr_bank,
|
||||
info->start, info->end);
|
||||
status = hv_do_rep_hypercall(
|
||||
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
|
||||
gva_n, nr_bank + 2, flush, NULL);
|
||||
gva_n, nr_bank, flush, NULL);
|
||||
}
|
||||
|
||||
local_irq_restore(flags);
|
||||
@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
|
||||
return;
|
||||
|
||||
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
|
||||
pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
|
||||
pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
|
||||
else
|
||||
pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
|
||||
pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
|
||||
}
|
||||
|
@ -62,8 +62,10 @@
|
||||
#define new_len2 145f-144f
|
||||
|
||||
/*
|
||||
* max without conditionals. Idea adapted from:
|
||||
* gas compatible max based on the idea from:
|
||||
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
|
||||
*
|
||||
* The additional "-" is needed because gas uses a "true" value of -1.
|
||||
*/
|
||||
#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
|
||||
|
||||
|
@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
|
||||
alt_end_marker ":\n"
|
||||
|
||||
/*
|
||||
* max without conditionals. Idea adapted from:
|
||||
* gas compatible max based on the idea from:
|
||||
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
|
||||
*
|
||||
* The additional "-" is needed because gas works with s32s.
|
||||
* The additional "-" is needed because gas uses a "true" value of -1.
|
||||
*/
|
||||
#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
|
||||
#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
|
||||
|
||||
/*
|
||||
* Pad the second replacement alternative with additional NOPs if it is
|
||||
|
@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
|
||||
DEBUG_LOCKS_WARN_ON(preemptible());
|
||||
}
|
||||
|
||||
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(mm));
|
||||
}
|
||||
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
static inline int init_new_context(struct task_struct *tsk,
|
||||
struct mm_struct *mm)
|
||||
|
@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
|
||||
* to this information.
|
||||
*/
|
||||
extern u32 *hv_vp_index;
|
||||
extern u32 hv_max_vp_index;
|
||||
|
||||
/**
|
||||
* hv_cpu_number_to_vp_number() - Map CPU to VP.
|
||||
|
@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
|
||||
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
|
||||
* to init_mm when we switch to a kernel thread (e.g. the idle thread). If
|
||||
* it's false, then we immediately switch CR3 when entering a kernel thread.
|
||||
*/
|
||||
DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
|
||||
|
||||
/*
|
||||
* 6 because 6 should be plenty and struct tlb_state will fit in
|
||||
* two cache lines.
|
||||
@ -104,6 +111,23 @@ struct tlb_state {
|
||||
u16 loaded_mm_asid;
|
||||
u16 next_asid;
|
||||
|
||||
/*
|
||||
* We can be in one of several states:
|
||||
*
|
||||
* - Actively using an mm. Our CPU's bit will be set in
|
||||
* mm_cpumask(loaded_mm) and is_lazy == false;
|
||||
*
|
||||
* - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
|
||||
* will not be set in mm_cpumask(&init_mm) and is_lazy == false.
|
||||
*
|
||||
* - Lazily using a real mm. loaded_mm != &init_mm, our bit
|
||||
* is set in mm_cpumask(loaded_mm), but is_lazy == true.
|
||||
* We're heuristically guessing that the CR3 load we
|
||||
* skipped more than makes up for the overhead added by
|
||||
* lazy mode.
|
||||
*/
|
||||
bool is_lazy;
|
||||
|
||||
/*
|
||||
* Access to this CR4 shadow and to H/W CR4 is protected by
|
||||
* disabling interrupts when modifying either one.
|
||||
|
@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
|
||||
return ~0U;
|
||||
}
|
||||
|
||||
static u32 skx_deadline_rev(void)
|
||||
{
|
||||
switch (boot_cpu_data.x86_mask) {
|
||||
case 0x03: return 0x01000136;
|
||||
case 0x04: return 0x02000014;
|
||||
}
|
||||
|
||||
return ~0U;
|
||||
}
|
||||
|
||||
static const struct x86_cpu_id deadline_match[] = {
|
||||
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
|
||||
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
|
||||
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
|
||||
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
|
||||
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
|
||||
|
||||
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
|
||||
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
|
||||
@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void)
|
||||
const struct x86_cpu_id *m;
|
||||
u32 rev;
|
||||
|
||||
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
|
||||
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
|
||||
boot_cpu_has(X86_FEATURE_HYPERVISOR))
|
||||
return;
|
||||
|
||||
m = x86_match_cpu(deadline_match);
|
||||
|
@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
|
||||
bool *res = &dis_ucode_ldr;
|
||||
#endif
|
||||
|
||||
if (!have_cpuid_p())
|
||||
return *res;
|
||||
|
||||
/*
|
||||
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
|
||||
* completely accurate as xen pv guests don't see that CPUID bit set but
|
||||
@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
|
||||
void __init load_ucode_bsp(void)
|
||||
{
|
||||
unsigned int cpuid_1_eax;
|
||||
bool intel = true;
|
||||
|
||||
if (check_loader_disabled_bsp())
|
||||
if (!have_cpuid_p())
|
||||
return;
|
||||
|
||||
cpuid_1_eax = native_cpuid_eax(1);
|
||||
|
||||
switch (x86_cpuid_vendor()) {
|
||||
case X86_VENDOR_INTEL:
|
||||
if (x86_family(cpuid_1_eax) >= 6)
|
||||
load_ucode_intel_bsp();
|
||||
if (x86_family(cpuid_1_eax) < 6)
|
||||
return;
|
||||
break;
|
||||
|
||||
case X86_VENDOR_AMD:
|
||||
if (x86_family(cpuid_1_eax) >= 0x10)
|
||||
load_ucode_amd_bsp(cpuid_1_eax);
|
||||
if (x86_family(cpuid_1_eax) < 0x10)
|
||||
return;
|
||||
intel = false;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
return;
|
||||
}
|
||||
|
||||
if (check_loader_disabled_bsp())
|
||||
return;
|
||||
|
||||
if (intel)
|
||||
load_ucode_intel_bsp();
|
||||
else
|
||||
load_ucode_amd_bsp(cpuid_1_eax);
|
||||
}
|
||||
|
||||
static bool check_loader_disabled_ap(void)
|
||||
|
@ -3,6 +3,15 @@
|
||||
|
||||
/* Kprobes and Optprobes common header */
|
||||
|
||||
#include <asm/asm.h>
|
||||
|
||||
#ifdef CONFIG_FRAME_POINTER
|
||||
# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
|
||||
" mov %" _ASM_SP ", %" _ASM_BP "\n"
|
||||
#else
|
||||
# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
#define SAVE_REGS_STRING \
|
||||
/* Skip cs, ip, orig_ax. */ \
|
||||
@ -17,7 +26,7 @@
|
||||
" pushq %r10\n" \
|
||||
" pushq %r11\n" \
|
||||
" pushq %rbx\n" \
|
||||
" pushq %rbp\n" \
|
||||
SAVE_RBP_STRING \
|
||||
" pushq %r12\n" \
|
||||
" pushq %r13\n" \
|
||||
" pushq %r14\n" \
|
||||
@ -48,7 +57,7 @@
|
||||
" pushl %es\n" \
|
||||
" pushl %ds\n" \
|
||||
" pushl %eax\n" \
|
||||
" pushl %ebp\n" \
|
||||
SAVE_RBP_STRING \
|
||||
" pushl %edi\n" \
|
||||
" pushl %esi\n" \
|
||||
" pushl %edx\n" \
|
||||
|
@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
|
||||
* raw stack chunk with redzones:
|
||||
*/
|
||||
__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
|
||||
regs->flags &= ~X86_EFLAGS_IF;
|
||||
trace_hardirqs_off();
|
||||
regs->ip = (unsigned long)(jp->entry);
|
||||
|
||||
/*
|
||||
|
@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
|
||||
load_cr3(initial_page_table);
|
||||
#else
|
||||
write_cr3(real_mode_header->trampoline_pgd);
|
||||
|
||||
/* Exiting long mode will fail if CR4.PCIDE is set. */
|
||||
if (static_cpu_has(X86_FEATURE_PCID))
|
||||
cr4_clear_bits(X86_CR4_PCIDE);
|
||||
#endif
|
||||
|
||||
/* Jump to the identity-mapped low memory code */
|
||||
|
@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
|
||||
state->stack_info.type, state->stack_info.next_sp,
|
||||
state->stack_mask, state->graph_idx);
|
||||
|
||||
for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
|
||||
for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
|
||||
sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
|
||||
if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
|
||||
break;
|
||||
|
||||
@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state)
|
||||
* This determines if the frame pointer actually contains an encoded pointer to
|
||||
* pt_regs on the stack. See ENCODE_FRAME_POINTER.
|
||||
*/
|
||||
#ifdef CONFIG_X86_64
|
||||
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
|
||||
{
|
||||
unsigned long regs = (unsigned long)bp;
|
||||
@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
|
||||
|
||||
return (struct pt_regs *)(regs & ~0x1);
|
||||
}
|
||||
#else
|
||||
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
|
||||
{
|
||||
unsigned long regs = (unsigned long)bp;
|
||||
|
||||
if (regs & 0x80000000)
|
||||
return NULL;
|
||||
|
||||
return (struct pt_regs *)(regs | 0x80000000);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
|
||||
#else
|
||||
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
|
||||
#endif
|
||||
|
||||
static bool update_stack_state(struct unwind_state *state,
|
||||
unsigned long *next_bp)
|
||||
@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state,
|
||||
regs = decode_frame_pointer(next_bp);
|
||||
if (regs) {
|
||||
frame = (unsigned long *)regs;
|
||||
len = regs_size(regs);
|
||||
len = KERNEL_REGS_SIZE;
|
||||
state->got_irq = true;
|
||||
} else {
|
||||
frame = next_bp;
|
||||
@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state,
|
||||
frame < prev_frame_end)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* On 32-bit with user mode regs, make sure the last two regs are safe
|
||||
* to access:
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
|
||||
!on_stack(info, frame, len + 2*sizeof(long)))
|
||||
return false;
|
||||
|
||||
/* Move state to the next frame: */
|
||||
if (regs) {
|
||||
state->regs = regs;
|
||||
@ -328,6 +355,13 @@ bad_address:
|
||||
state->regs->sp < (unsigned long)task_pt_regs(state->task))
|
||||
goto the_end;
|
||||
|
||||
/*
|
||||
* There are some known frame pointer issues on 32-bit. Disable
|
||||
* unwinder warnings on 32-bit until it gets objtool support.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_X86_32))
|
||||
goto the_end;
|
||||
|
||||
if (state->regs) {
|
||||
printk_deferred_once(KERN_WARNING
|
||||
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
|
||||
|
@ -1,5 +1,12 @@
|
||||
# Kernel does not boot with instrumentation of tlb.c.
|
||||
KCOV_INSTRUMENT_tlb.o := n
|
||||
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
|
||||
KCOV_INSTRUMENT_tlb.o := n
|
||||
KCOV_INSTRUMENT_mem_encrypt.o := n
|
||||
|
||||
KASAN_SANITIZE_mem_encrypt.o := n
|
||||
|
||||
ifdef CONFIG_FUNCTION_TRACER
|
||||
CFLAGS_REMOVE_mem_encrypt.o = -pg
|
||||
endif
|
||||
|
||||
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
|
||||
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
|
||||
|
@ -30,6 +30,8 @@
|
||||
|
||||
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
|
||||
|
||||
DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
|
||||
|
||||
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
|
||||
u16 *new_asid, bool *need_flush)
|
||||
{
|
||||
@ -80,7 +82,7 @@ void leave_mm(int cpu)
|
||||
return;
|
||||
|
||||
/* Warn if we're not lazy. */
|
||||
WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
|
||||
WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
|
||||
|
||||
switch_mm(NULL, &init_mm, NULL);
|
||||
}
|
||||
@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
__flush_tlb_all();
|
||||
}
|
||||
#endif
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, false);
|
||||
|
||||
if (real_prev == next) {
|
||||
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
next->context.ctx_id);
|
||||
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
||||
/*
|
||||
* There's nothing to do: we weren't lazy, and we
|
||||
* aren't changing our mm. We don't need to flush
|
||||
* anything, nor do we need to update CR3, CR4, or
|
||||
* LDTR.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
/* Resume remote flushes and then read tlb_gen. */
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
|
||||
next_tlb_gen) {
|
||||
/*
|
||||
* Ideally, we'd have a flush_tlb() variant that
|
||||
* takes the known CR3 value as input. This would
|
||||
* be faster on Xen PV and on hypothetical CPUs
|
||||
* on which INVPCID is fast.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
|
||||
next_tlb_gen);
|
||||
write_cr3(build_cr3(next, prev_asid));
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
|
||||
TLB_FLUSH_ALL);
|
||||
}
|
||||
|
||||
/*
|
||||
* We just exited lazy mode, which means that CR4 and/or LDTR
|
||||
* may be stale. (Changes to the required CR4 and LDTR states
|
||||
* are not reflected in tlb_gen.)
|
||||
* We don't currently support having a real mm loaded without
|
||||
* our cpu set in mm_cpumask(). We have all the bookkeeping
|
||||
* in place to figure out whether we would need to flush
|
||||
* if our cpu were cleared in mm_cpumask(), but we don't
|
||||
* currently use it.
|
||||
*/
|
||||
if (WARN_ON_ONCE(real_prev != &init_mm &&
|
||||
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
return;
|
||||
} else {
|
||||
u16 new_asid;
|
||||
bool need_flush;
|
||||
@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
}
|
||||
|
||||
/* Stop remote flushes for the previous mm */
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
||||
|
||||
VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
|
||||
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
|
||||
real_prev != &init_mm);
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
||||
|
||||
/*
|
||||
* Start remote flushes and then read tlb_gen.
|
||||
@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
switch_ldt(real_prev, next);
|
||||
}
|
||||
|
||||
/*
|
||||
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
|
||||
* kernel thread or other context without an mm. Acceptable implementations
|
||||
* include doing nothing whatsoever, switching to init_mm, or various clever
|
||||
* lazy tricks to try to minimize TLB flushes.
|
||||
*
|
||||
* The scheduler reserves the right to call enter_lazy_tlb() several times
|
||||
* in a row. It will notify us that we're going back to a real mm by
|
||||
* calling switch_mm_irqs_off().
|
||||
*/
|
||||
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
||||
return;
|
||||
|
||||
if (static_branch_unlikely(&tlb_use_lazy_mode)) {
|
||||
/*
|
||||
* There's a significant optimization that may be possible
|
||||
* here. We have accurate enough TLB flush tracking that we
|
||||
* don't need to maintain coherence of TLB per se when we're
|
||||
* lazy. We do, however, need to maintain coherence of
|
||||
* paging-structure caches. We could, in principle, leave our
|
||||
* old mm loaded and only switch to init_mm when
|
||||
* tlb_remove_page() happens.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
||||
} else {
|
||||
switch_mm(NULL, &init_mm, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Call this when reinitializing a CPU. It fixes the following potential
|
||||
* problems:
|
||||
@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
||||
/* This code cannot presently handle being reentered. */
|
||||
VM_WARN_ON(!irqs_disabled());
|
||||
|
||||
if (unlikely(loaded_mm == &init_mm))
|
||||
return;
|
||||
|
||||
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
||||
loaded_mm->context.ctx_id);
|
||||
|
||||
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
|
||||
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
|
||||
/*
|
||||
* We're in lazy mode -- don't flush. We can get here on
|
||||
* remote flushes due to races and on local flushes if a
|
||||
* kernel thread coincidentally flushes the mm it's lazily
|
||||
* still using.
|
||||
* We're in lazy mode. We need to at least flush our
|
||||
* paging-structure cache to avoid speculatively reading
|
||||
* garbage into our TLB. Since switching to init_mm is barely
|
||||
* slower than a minimal flush, just switch to init_mm.
|
||||
*/
|
||||
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
|
||||
return 0;
|
||||
}
|
||||
late_initcall(create_tlb_single_page_flush_ceiling);
|
||||
|
||||
static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
char buf[2];
|
||||
|
||||
buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
|
||||
buf[1] = '\n';
|
||||
|
||||
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
|
||||
}
|
||||
|
||||
static ssize_t tlblazy_write_file(struct file *file,
|
||||
const char __user *user_buf, size_t count, loff_t *ppos)
|
||||
{
|
||||
bool val;
|
||||
|
||||
if (kstrtobool_from_user(user_buf, count, &val))
|
||||
return -EINVAL;
|
||||
|
||||
if (val)
|
||||
static_branch_enable(&tlb_use_lazy_mode);
|
||||
else
|
||||
static_branch_disable(&tlb_use_lazy_mode);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static const struct file_operations fops_tlblazy = {
|
||||
.read = tlblazy_read_file,
|
||||
.write = tlblazy_write_file,
|
||||
.llseek = default_llseek,
|
||||
};
|
||||
|
||||
static int __init init_tlb_use_lazy_mode(void)
|
||||
{
|
||||
if (boot_cpu_has(X86_FEATURE_PCID)) {
|
||||
/*
|
||||
* Heuristic: with PCID on, switching to and from
|
||||
* init_mm is reasonably fast, but remote flush IPIs
|
||||
* as expensive as ever, so turn off lazy TLB mode.
|
||||
*
|
||||
* We can't do this in setup_pcid() because static keys
|
||||
* haven't been initialized yet, and it would blow up
|
||||
* badly.
|
||||
*/
|
||||
static_branch_disable(&tlb_use_lazy_mode);
|
||||
}
|
||||
|
||||
debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
|
||||
arch_debugfs_dir, NULL, &fops_tlblazy);
|
||||
return 0;
|
||||
}
|
||||
late_initcall(init_tlb_use_lazy_mode);
|
||||
|
Loading…
Reference in New Issue
Block a user