Mirror of https://github.com/torvalds/linux.git (synced 2024-11-10 22:21:40 +00:00)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
This commit is contained in: commit 7a69f9c60b
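The TLB-flush rework in this pull replaces the old four-argument flush_tlb_others(cpumask, mm, start, end) with a single struct flush_tlb_info parameter (see the tlbflush.h and paravirt changes below). As a rough, non-authoritative sketch of the new calling convention — the caller function below is hypothetical; only struct flush_tlb_info, its mm/start/end fields and the flush_tlb_others()/native_flush_tlb_others() names come from this diff:

    #include <linux/cpumask.h>
    #include <linux/mm_types.h>
    #include <asm/tlbflush.h>

    /*
     * Hypothetical caller, for illustration only: flush a virtual-address
     * range on every CPU that may cache translations for 'mm'.  The mm,
     * start and end that used to be separate arguments now travel
     * together in one struct flush_tlb_info.
     */
    static void example_flush_range(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
    {
            struct flush_tlb_info info = {
                    .mm    = mm,
                    .start = start,
                    .end   = end,
            };

            /*
             * Without CONFIG_PARAVIRT this expands to
             * native_flush_tlb_others(); a paravirt backend can override
             * it via pv_mmu_ops.flush_tlb_others.
             */
            flush_tlb_others(mm_cpumask(mm), &info);
    }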
@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM

config HAVE_GENERIC_RCU_GUP
config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE

@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
config ZONE_DMA
def_bool y

config HAVE_GENERIC_RCU_GUP
config HAVE_GENERIC_GUP
def_bool y

config ARCH_DMA_ADDR_T_64BIT

@@ -184,7 +184,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_RCU_GUP
select HAVE_GENERIC_GUP
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT

@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11

config HAVE_GENERIC_GUP
def_bool y

source "net/Kconfig"

source "drivers/Kconfig"
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;

/* The first GDT is a dummy and the second is unused. */
desc += 2;
/* The first GDT is a dummy. */
desc++;

if (IS_ENABLED(CONFIG_X86_64)) {
/* __KERNEL32_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
} else {
/* Second entry is unused on 32-bit */
desc++;
}

/* __KERNEL_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
if (IS_ENABLED(CONFIG_X86_64)) {
desc->l = 1;
desc->d = 0;
} else {
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
}
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;

desc++;

/* __KERNEL_DS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;

#ifdef CONFIG_X86_64
/* Task segment value */
desc++;
desc->limit0 = 0x0000;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_TSS;
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
#endif /* CONFIG_X86_64 */

if (IS_ENABLED(CONFIG_X86_64)) {
/* Task segment value */
desc->limit0 = 0x0000;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_TSS;
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
}

asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt));
@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp

#ifdef CONFIG_X86_5LEVEL
/* Check if 5-level paging has already enabled */
movq %cr4, %rax
testl $X86_CR4_LA57, %eax
jnz lvl5

/*
* At this point we are in long mode with 4-level paging enabled,
* but we want to enable 5-level paging.
*
* The problem is that we cannot do it directly. Setting LA57 in
* long mode would trigger #GP. So we need to switch off long mode
* first.
*
* NOTE: This is not going to work if bootloader put us above 4G
* limit.
*
* The first step is go into compatibility mode.
*/

/* Clear additional page table */
leaq lvl5_pgtable(%rbx), %rdi
xorq %rax, %rax
movq $(PAGE_SIZE/8), %rcx
rep stosq

/*
* Setup current CR3 as the first and only entry in a new top level
* page table.
*/
movq %cr3, %rdi
leaq 0x7 (%rdi), %rax
movq %rax, lvl5_pgtable(%rbx)

/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq compatible_mode(%rip), %rax
pushq %rax
lretq
lvl5:
#endif

/* Zero EFLAGS */
pushq $0
popfq
@@ -429,6 +471,44 @@ relocated:
jmp *%rax

.code32
#ifdef CONFIG_X86_5LEVEL
compatible_mode:
/* Setup data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss

/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0

/* Point CR3 to 5-level paging */
leal lvl5_pgtable(%ebx), %eax
movl %eax, %cr3

/* Enable PAE and LA57 mode */
movl %cr4, %eax
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl %eax, %cr4

/* Calculate address we are running at */
call 1f
1: popl %edi
subl $1b, %edi

/* Prepare stack for far return to Long Mode */
pushl $__KERNEL_CS
leal lvl5(%edi), %eax
push %eax

/* Enable paging back */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0

lret
#endif

no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
#ifdef CONFIG_X86_5LEVEL
lvl5_pgtable:
.fill PAGE_SIZE, 1, 0
#endif
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data;

/* The top level page table entry pointer. */
static unsigned long level4p;
static unsigned long top_level_pgt;

/*
* Mapping information structure passed to kernel_ident_mapping_init().
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
*
* With 5-level paging, we use '_pgtable' to allocate the p4d page table,
* the top-level page table is allocated separately.
*
* p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
* cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
level4p = read_cr3();
if (level4p == (unsigned long)_pgtable) {
top_level_pgt = read_cr3_pa();
if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
level4p = (unsigned long)alloc_pgt_page(&pgt_data);
top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
}

@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return;

/* Build the mapping. */
kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
}

@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/
void finalize_identity_maps(void)
{
write_cr3(level4p);
write_cr3(top_level_pgt);
}
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
* Change top 16 bits to be the sign-extension of 47th bit
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)

static void refresh_pce(void *ignored)
{
if (current->active_mm)
load_mm_cr4(current->active_mm);
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}

static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)

/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
if (!ldt || idx > ldt->size)
if (!ldt || idx > ldt->nr_entries)
return 0;

desc = &ldt->entries[idx];
@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
efi_scratch.prev_cr3 = read_cr3(); \
efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif
unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;

#ifdef CONFIG_SMP
void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
{
}
#endif

#endif /* _ASM_X86_MMU_H */
@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int size;
unsigned int nr_entries;
};

/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/

if (unlikely(ldt))
set_ldt(ldt->entries, ldt->size);
set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
}

static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT if either the old or new mm had an LDT.
*
* An mm will never go from having an LDT to not having an LDT. Two
* mms never share an LDT, so we don't gain anything by checking to
* see whether the LDT changed. There's also no guarantee that
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
* then prev->context.ldt will also be non-NULL.
*
* If we really cared, we could optimize the case where prev == next
* and we're exiting lazy mode. Most of the time, if this happens,
* we don't actually need to reload LDTR, but modify_ldt() is mostly
* used by legacy code and emulators where we don't need this level of
* performance.
*
* This uses | instead of || because it generates better code.
*/
if (unlikely((unsigned long)prev->context.ldt |
(unsigned long)next->context.ldt))
load_mm_ldt(next);
#endif

DEBUG_LOCKS_WARN_ON(preemptible());
}

static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}

static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif

static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();

if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;

return true;
}

/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}

/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);

/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || !in_atomic());

VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}

#endif /* _ASM_X86_MMU_CONTEXT_H */
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}

static inline unsigned long read_cr3(void)
static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}

static inline void flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
const struct flush_tlb_info *info)
{
PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}

static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
struct flush_tlb_info;

/*
* Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
const struct flush_tlb_info *info);

/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })

#define gup_get_pte gup_get_pte
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a PTE will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
*
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees 'l' iff pte_high
* sees 'h'. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have gotten rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. Because get_user_pages_fast() only operates on present ptes
* we're safe.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
pte_t pte;

do {
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
} while (unlikely(pte.pte_low != ptep->pte_low));

return pte;
}

#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif

static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}

static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();

if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;

return true;
}

/*
* 'pteval' can come from a PTE, PMD or PUD. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;

if (write)
need_pte_bits |= _PAGE_RW;

if ((pteval & need_pte_bits) != need_pte_bits)
return 0;

return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}

#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
return __pte_access_permitted(pte_val(pte), write);
}

#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
return __pte_access_permitted(pmd_val(pmd), write);
}

#define pud_access_permitted pud_access_permitted
static inline bool pud_access_permitted(pud_t pud, bool write)
{
return __pte_access_permitted(pud_val(pud), write);
}

#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
@@ -14,15 +14,17 @@
#include <linux/bitops.h>
#include <linux/threads.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_level4_pgt
#define swapper_pg_dir init_top_pgt

extern void paging_init(void);

@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

#endif /* !__ASSEMBLY__ */
#define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
int write)
{
unsigned long len, end;

len = (unsigned long)nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start)
return false;
if (end >> __VIRTUAL_MASK_SHIFT)
return false;
return true;
}

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
@@ -8,4 +8,40 @@
#else
#define X86_VM_MASK 0 /* No VM86 support */
#endif

/*
* CR3's layout varies depending on several things.
*
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
* CR3[2:0] and CR3[11:5] are ignored.
*
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
*
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
* written as 1 to prevent the write to CR3 from flushing the TLB.
*
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID bits. */
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK 0xFFFull
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
* a tiny bit of code size by setting all the bits.
*/
#define CR3_ADDR_MASK 0xFFFFFFFFull
#define CR3_PCID_MASK 0ull
#endif

#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)

/*
* Friendlier CR3 helpers.
*/
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK;
}

static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}

static inline unsigned long native_read_cr3(void)
static inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x);
}

static inline unsigned long read_cr3(void)
/*
* Careful! CR3 contains more than just an address. You probably want
* read_cr3_pa() instead.
*/
static inline unsigned long __read_cr3(void)
{
return native_read_cr3();
return __native_read_cr3();
}

static inline void write_cr3(unsigned long x)
arch/x86/include/asm/tlbbatch.h (new file, 14 lines)
@@ -0,0 +1,14 @@
#ifndef _ARCH_X86_TLBBATCH_H
#define _ARCH_X86_TLBBATCH_H

#include <linux/cpumask.h>

struct arch_tlbflush_unmap_batch {
/*
* Each bit set is a CPU that potentially has a TLB entry for one of
* the PFNs being flushed..
*/
struct cpumask cpumask;
};

#endif /* _ARCH_X86_TLBBATCH_H */
@@ -7,6 +7,7 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>

static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif

struct tlb_state {
#ifdef CONFIG_SMP
struct mm_struct *active_mm;
/*
* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
* are on. This means that it may not match current->active_mm,
* which will contain the previous user mm when we're in lazy TLB
* mode even if we've already switched back to swapper_pg_dir.
*/
struct mm_struct *loaded_mm;
int state;
#endif

/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
native_write_cr3(native_read_cr3());
native_write_cr3(__native_read_cr3());
preempt_enable();
}

@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
* - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
* - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/

#ifndef CONFIG_SMP

/* "_up" is for UniProcessor.
*
* This is a helper for other header functions. *Not* intended to be called
* directly. All global TLB flushes need to either call this, or to bump the
* vm statistics themselves.
*/
static inline void __flush_tlb_up(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}

static inline void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}

static inline void local_flush_tlb(void)
{
__flush_tlb_up();
}

static inline void flush_tlb_mm(struct mm_struct *mm)
{
if (mm == current->active_mm)
__flush_tlb_up();
}

static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long addr)
{
if (vma->vm_mm == current->active_mm)
__flush_tlb_one(addr);
}

static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (vma->vm_mm == current->active_mm)
__flush_tlb_up();
}

static inline void flush_tlb_mm_range(struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned long vmflag)
{
if (mm == current->active_mm)
__flush_tlb_up();
}

static inline void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
}

static inline void reset_lazy_tlbstate(void)
{
}

static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
flush_tlb_all();
}

#else /* SMP */

#include <asm/smp.h>
struct flush_tlb_info {
struct mm_struct *mm;
unsigned long start;
unsigned long end;
};

#define local_flush_tlb() __flush_tlb()

@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)

extern void flush_tlb_all(void);
extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);

static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);
const struct flush_tlb_info *info);

#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2

static inline void reset_lazy_tlbstate(void)
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
this_cpu_write(cpu_tlbstate.state, 0);
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}

#endif /* SMP */
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
#define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, info)
#endif

#endif /* _ASM_X86_TLBFLUSH_H */
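The arch_tlbflush_unmap_batch / arch_tlbbatch_add_mm() / arch_tlbbatch_flush() additions above back the newly selected ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH. A hedged sketch of the intended usage pattern — the caller below is illustrative and not part of this merge; only the batch type and the two arch_tlbbatch_* helpers are taken from the diff:

    #include <linux/mm_types.h>
    #include <asm/tlbbatch.h>
    #include <asm/tlbflush.h>

    /*
     * Illustration only: while unmapping pages, record every CPU that may
     * still hold TLB entries for the mm instead of sending an IPI per
     * page, then issue one combined flush at the end.
     */
    static void example_batched_unmap(struct arch_tlbflush_unmap_batch *batch,
                                      struct mm_struct *mm)
    {
            /* Accumulate mm's CPUs into batch->cpumask; no flush yet. */
            arch_tlbbatch_add_mm(batch, mm);

            /* ... unmap more pages, possibly from other address spaces ... */

            /* One flush covering everything recorded so far. */
            arch_tlbbatch_flush(batch);
    }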
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H

#include <asm/tlbflush.h>

enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};

struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end,
unsigned int cpu);
const struct flush_tlb_info *info);

#else /* X86_UV */

@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned int cpu)
uv_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{ return cpumask; }

#endif /* X86_UV */
@@ -104,6 +104,8 @@
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
endif

KASAN_SANITIZE_head$(BITS).o := n
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;

/* Install the espfix pud into the kernel page directory */
pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
@@ -33,17 +33,120 @@
/*
* Manage page tables very early on.
*/
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt = 2;
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

#define __head __section(.head.text)

static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
{
return ptr - (void *)_text + (void *)physaddr;
}

void __head __startup_64(unsigned long physaddr)
{
unsigned long load_delta, *p;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;

/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);

/*
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);

/* Is the address not 2M aligned? */
if (load_delta & ~PMD_PAGE_MASK)
for (;;);

/* Fixup the physical addresses in the page table */

pgd = fixup_pointer(&early_top_pgt, physaddr);
pgd[pgd_index(__START_KERNEL_map)] += load_delta;

if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}

pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;

pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
pmd[506] += load_delta;

/*
* Set up the identity mapping for the switchover. These
* entries should *NOT* have the global bit set! This also
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/

pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);

if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);

i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;

i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
}

i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;

pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
pmd_entry += physaddr;

for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
pmd[idx] = pmd_entry + i * PMD_SIZE;
}

/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/

pmd = fixup_pointer(level2_kernel_pgt, physaddr);
for (i = 0; i < PTRS_PER_PMD; i++) {
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
}

/* Fixup phys_base */
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta;
}

/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
write_cr3(__pa_nodebug(early_level4_pgt));
write_cr3(__pa_nodebug(early_top_pgt));
}

/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;

/* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;

again:
pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;

/*
@@ -67,8 +171,25 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
if (pgd)
pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
goto again;
}

p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
*pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
p4d_p += p4d_index(address);
p4d = *p4d_p;

if (p4d)
pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -77,7 +198,7 @@ again:

pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
*p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)

clear_bss();

clear_page(init_level4_pgt);
clear_page(init_top_pgt);

kasan_early_init();

@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();

/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
/* set init_top_pgt kernel high mapping*/
init_top_pgt[511] = early_top_pgt[511];

x86_64_start_reservations(real_mode_data);
}
@@ -37,10 +37,11 @@
*
*/

#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
L4_START_KERNEL = pgd_index(__START_KERNEL_map)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)

.text
@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu

/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
subq $_text - __START_KERNEL_map, %rbp

/* Is the address not 2M aligned? */
testl $~PMD_PAGE_MASK, %ebp
jnz bad_address

/*
* Is the address too large?
*/
leaq _text(%rip), %rax
shrq $MAX_PHYSMEM_BITS, %rax
jnz bad_address

/*
* Fixup the physical addresses in the page table
*/
addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)

addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)

addq %rbp, level2_fixmap_pgt + (506*8)(%rip)

/*
* Set up the identity mapping for the switchover. These
* entries should *NOT* have the global bit set! This also
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/
leaq _text(%rip), %rdi
leaq early_level4_pgt(%rip), %rbx
pushq %rsi
call __startup_64
popq %rsi

movq %rdi, %rax
shrq $PGDIR_SHIFT, %rax

leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
movq %rdx, 0(%rbx,%rax,8)
movq %rdx, 8(%rbx,%rax,8)

addq $PAGE_SIZE, %rdx
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andl $(PTRS_PER_PUD-1), %eax
movq %rdx, PAGE_SIZE(%rbx,%rax,8)
incl %eax
andl $(PTRS_PER_PUD-1), %eax
movq %rdx, PAGE_SIZE(%rbx,%rax,8)

addq $PAGE_SIZE * 2, %rbx
movq %rdi, %rax
shrq $PMD_SHIFT, %rdi
addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
leaq (_end - 1)(%rip), %rcx
shrq $PMD_SHIFT, %rcx
subq %rdi, %rcx
incl %ecx

1:
andq $(PTRS_PER_PMD - 1), %rdi
movq %rax, (%rbx,%rdi,8)
incq %rdi
addq $PMD_SIZE, %rax
decl %ecx
jnz 1b

test %rbp, %rbp
jz .Lskip_fixup

/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
leaq level2_kernel_pgt(%rip), %rdi
leaq PAGE_SIZE(%rdi), %r8
/* See if it is a valid page table entry */
1: testb $_PAGE_PRESENT, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
2: addq $8, %rdi
cmp %r8, %rdi
jne 1b

/* Fixup phys_base */
addq %rbp, phys_base(%rip)

.Lskip_fixup:
movq $(early_level4_pgt - __START_KERNEL_map), %rax
movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu

movq $(init_level4_pgt - __START_KERNEL_map), %rax
movq $(init_top_pgt - __START_KERNEL_map), %rax
1:

/* Enable PAE mode and PGE */
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
orl $X86_CR4_LA57, %ecx
#endif
movq %rcx, %cr4

/* Setup early boot stage 4 level pagetables. */
/* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3

@@ -417,9 +332,13 @@ GLOBAL(name)
.endr

__INITDATA
NEXT_PAGE(early_level4_pgt)
NEXT_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#endif

NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data

#ifndef CONFIG_XEN
NEXT_PAGE(init_level4_pgt)
NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
NEXT_PAGE(init_level4_pgt)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif

#ifdef CONFIG_X86_5LEVEL
NEXT_PAGE(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#endif

NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
@@ -22,24 +22,25 @@
#include <asm/syscalls.h>

/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *current_mm)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
mm_context_t *pc;

if (current->active_mm != current_mm)
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;

pc = &current->active_mm->context;
set_ldt(pc->ldt->entries, pc->ldt->size);
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
}

/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int size)
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;

if (size > LDT_ENTRIES)
if (num_entries > LDT_ENTRIES)
return NULL;

new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;

BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
alloc_size = size * LDT_ENTRY_SIZE;
alloc_size = num_entries * LDT_ENTRY_SIZE;

/*
* Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
}

new_ldt->size = size;
new_ldt->nr_entries = num_entries;
return new_ldt;
}

/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
paravirt_alloc_ldt(ldt->entries, ldt->size);
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}

/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;

paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
paravirt_free_ldt(ldt->entries, ldt->nr_entries);
if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock;
}

new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
retval = -ENOMEM;
goto out_unlock;
}

memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->size * LDT_ENTRY_SIZE);
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);

mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)

static int read_ldt(void __user *ptr, unsigned long bytecount)
{
int retval;
unsigned long size;
struct mm_struct *mm = current->mm;
unsigned long entries_size;
int retval;

mutex_lock(&mm->context.lock);

@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;

size = mm->context.ldt->size * LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;
entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
if (entries_size > bytecount)
entries_size = bytecount;

if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}

if (size != bytecount) {
if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
if (clear_user(ptr + size, bytecount - size)) {
if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
unsigned int oldsize, newsize;
unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)

mutex_lock(&mm->context.lock);

old_ldt = mm->context.ldt;
oldsize = old_ldt ? old_ldt->size : 0;
newsize = max(ldt_info.entry_number + 1, oldsize);
old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);

error = -ENOMEM;
new_ldt = alloc_ldt_struct(newsize);
new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;

if (old_ldt)
memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);

new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_level4_pgt);
VMCOREINFO_SYMBOL(init_top_pgt);

#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {

.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3,
.read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3,

.flush_tlb_user = native_flush_tlb,
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)

cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)

cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr3 = __read_cr3();
cr4 = __read_cr4();

printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
dead_task->mm->context.ldt->size);
dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();

/* Ack it */
(void)cpu_report_death();
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)

mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt ||
seg >= child->mm->context.ldt->size))
seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
desc = &child->mm->context.ldt->entries[seg];
@@ -49,6 +49,7 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>

#include "trace.h"
#include "pmu.h"
@@ -597,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
unsigned long cr0, cr4;
unsigned long cr0, cr3, cr4;

cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */

/*
* Save the most likely value for this task's CR3 in the VMCS.
* We can't use __get_current_cr3_fast() because we're not atomic.
*/
cr3 = __read_cr3();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
vmx->host_state.vmcs_host_cr3 = cr3;

/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr4;
unsigned long debugctlmsr, cr3, cr4;

/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
vmcs_writel(HOST_CR3, cr3);
vmx->host_state.vmcs_host_cr3 = cr3;
}

cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
seg >>= 3;
|
||||
mutex_lock(¤t->mm->context.lock);
|
||||
if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
|
||||
if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
|
||||
ret = current->mm->context.ldt->entries[seg];
|
||||
mutex_unlock(¤t->mm->context.lock);
|
||||
#endif
|
||||
|
@ -2,7 +2,7 @@
|
||||
KCOV_INSTRUMENT_tlb.o := n
|
||||
|
||||
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
|
||||
pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
|
||||
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
|
||||
|
||||
# Make sure __phys_addr has no stackprotector
|
||||
nostackp := $(call cc-option, -fno-stack-protector)
|
||||
|
@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
|
||||
bool checkwx)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
pgd_t *start = (pgd_t *) &init_level4_pgt;
|
||||
pgd_t *start = (pgd_t *) &init_top_pgt;
|
||||
#else
|
||||
pgd_t *start = swapper_pg_dir;
|
||||
#endif
|
||||
|
@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
|
||||
* Do _not_ use "current" here. We might be inside
|
||||
* an interrupt in the middle of a task switch..
|
||||
*/
|
||||
pgd_paddr = read_cr3();
|
||||
pgd_paddr = read_cr3_pa();
|
||||
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
|
||||
if (!pmd_k)
|
||||
return -1;
|
||||
@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
|
||||
|
||||
static void dump_pagetable(unsigned long address)
|
||||
{
|
||||
pgd_t *base = __va(read_cr3());
|
||||
pgd_t *base = __va(read_cr3_pa());
|
||||
pgd_t *pgd = &base[pgd_index(address)];
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
|
||||
* happen within a race in page table update. In the later
|
||||
* case just flush:
|
||||
*/
|
||||
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
|
||||
pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
|
||||
pgd_ref = pgd_offset_k(address);
|
||||
if (pgd_none(*pgd_ref))
|
||||
return -1;
|
||||
@ -555,7 +555,7 @@ static int bad_address(void *p)
|
||||
|
||||
static void dump_pagetable(unsigned long address)
|
||||
{
|
||||
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
|
||||
pgd_t *base = __va(read_cr3_pa());
|
||||
pgd_t *pgd = base + pgd_index(address);
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
|
||||
pgd_t *pgd;
|
||||
pte_t *pte;
|
||||
|
||||
pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
|
||||
pgd = __va(read_cr3_pa());
|
||||
pgd += pgd_index(address);
|
||||
|
||||
pte = lookup_address_in_pgd(pgd, address, &level);
|
||||
|
@ -1,496 +0,0 @@
|
||||
/*
|
||||
* Lockless get_user_pages_fast for x86
|
||||
*
|
||||
* Copyright (C) 2008 Nick Piggin
|
||||
* Copyright (C) 2008 Novell Inc.
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmstat.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/memremap.h>
|
||||
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
static inline pte_t gup_get_pte(pte_t *ptep)
|
||||
{
|
||||
#ifndef CONFIG_X86_PAE
|
||||
return READ_ONCE(*ptep);
|
||||
#else
|
||||
/*
|
||||
* With get_user_pages_fast, we walk down the pagetables without taking
|
||||
* any locks. For this we would like to load the pointers atomically,
|
||||
* but that is not possible (without expensive cmpxchg8b) on PAE. What
|
||||
* we do have is the guarantee that a pte will only either go from not
|
||||
* present to present, or present to not present or both -- it will not
|
||||
* switch to a completely different present page without a TLB flush in
|
||||
* between; something that we are blocking by holding interrupts off.
|
||||
*
|
||||
* Setting ptes from not present to present goes:
|
||||
* ptep->pte_high = h;
|
||||
* smp_wmb();
|
||||
* ptep->pte_low = l;
|
||||
*
|
||||
* And present to not present goes:
|
||||
* ptep->pte_low = 0;
|
||||
* smp_wmb();
|
||||
* ptep->pte_high = 0;
|
||||
*
|
||||
* We must ensure here that the load of pte_low sees l iff pte_high
|
||||
* sees h. We load pte_high *after* loading pte_low, which ensures we
|
||||
* don't see an older value of pte_high. *Then* we recheck pte_low,
|
||||
* which ensures that we haven't picked up a changed pte high. We might
|
||||
* have got rubbish values from pte_low and pte_high, but we are
|
||||
* guaranteed that pte_low will not have the present bit set *unless*
|
||||
* it is 'l'. And get_user_pages_fast only operates on present ptes, so
|
||||
* we're safe.
|
||||
*
|
||||
* gup_get_pte should not be used or copied outside gup.c without being
|
||||
* very careful -- it does not atomically load the pte or anything that
|
||||
* is likely to be useful for you.
|
||||
*/
|
||||
pte_t pte;
|
||||
|
||||
retry:
|
||||
pte.pte_low = ptep->pte_low;
|
||||
smp_rmb();
|
||||
pte.pte_high = ptep->pte_high;
|
||||
smp_rmb();
|
||||
if (unlikely(pte.pte_low != ptep->pte_low))
|
||||
goto retry;
|
||||
|
||||
return pte;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
|
||||
{
|
||||
while ((*nr) - nr_start) {
|
||||
struct page *page = pages[--(*nr)];
|
||||
|
||||
ClearPageReferenced(page);
|
||||
put_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* 'pteval' can come from a pte, pmd, pud or p4d. We only check
|
||||
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
|
||||
* same value on all 4 types.
|
||||
*/
|
||||
static inline int pte_allows_gup(unsigned long pteval, int write)
|
||||
{
|
||||
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
|
||||
|
||||
if (write)
|
||||
need_pte_bits |= _PAGE_RW;
|
||||
|
||||
if ((pteval & need_pte_bits) != need_pte_bits)
|
||||
return 0;
|
||||
|
||||
/* Check memory protection keys permissions. */
|
||||
if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* The performance critical leaf functions are made noinline otherwise gcc
|
||||
* inlines everything into a single function which results in too much
|
||||
* register pressure.
|
||||
*/
|
||||
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
|
||||
unsigned long end, int write, struct page **pages, int *nr)
|
||||
{
|
||||
struct dev_pagemap *pgmap = NULL;
|
||||
int nr_start = *nr, ret = 0;
|
||||
pte_t *ptep, *ptem;
|
||||
|
||||
/*
|
||||
* Keep the original mapped PTE value (ptem) around since we
|
||||
* might increment ptep off the end of the page when finishing
|
||||
* our loop iteration.
|
||||
*/
|
||||
ptem = ptep = pte_offset_map(&pmd, addr);
|
||||
do {
|
||||
pte_t pte = gup_get_pte(ptep);
|
||||
struct page *page;
|
||||
|
||||
/* Similar to the PMD case, NUMA hinting must take slow path */
|
||||
if (pte_protnone(pte))
|
||||
break;
|
||||
|
||||
if (!pte_allows_gup(pte_val(pte), write))
|
||||
break;
|
||||
|
||||
if (pte_devmap(pte)) {
|
||||
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
|
||||
if (unlikely(!pgmap)) {
|
||||
undo_dev_pagemap(nr, nr_start, pages);
|
||||
break;
|
||||
}
|
||||
} else if (pte_special(pte))
|
||||
break;
|
||||
|
||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||
page = pte_page(pte);
|
||||
get_page(page);
|
||||
put_dev_pagemap(pgmap);
|
||||
SetPageReferenced(page);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
|
||||
} while (ptep++, addr += PAGE_SIZE, addr != end);
|
||||
if (addr == end)
|
||||
ret = 1;
|
||||
pte_unmap(ptem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void get_head_page_multiple(struct page *page, int nr)
|
||||
{
|
||||
VM_BUG_ON_PAGE(page != compound_head(page), page);
|
||||
VM_BUG_ON_PAGE(page_count(page) == 0, page);
|
||||
page_ref_add(page, nr);
|
||||
SetPageReferenced(page);
|
||||
}
|
||||
|
||||
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
|
||||
unsigned long end, struct page **pages, int *nr)
|
||||
{
|
||||
int nr_start = *nr;
|
||||
struct dev_pagemap *pgmap = NULL;
|
||||
|
||||
do {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
pgmap = get_dev_pagemap(pfn, pgmap);
|
||||
if (unlikely(!pgmap)) {
|
||||
undo_dev_pagemap(nr, nr_start, pages);
|
||||
return 0;
|
||||
}
|
||||
SetPageReferenced(page);
|
||||
pages[*nr] = page;
|
||||
get_page(page);
|
||||
put_dev_pagemap(pgmap);
|
||||
(*nr)++;
|
||||
pfn++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
|
||||
unsigned long end, struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long fault_pfn;
|
||||
|
||||
fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
return __gup_device_huge(fault_pfn, addr, end, pages, nr);
|
||||
}
|
||||
|
||||
static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
|
||||
unsigned long end, struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long fault_pfn;
|
||||
|
||||
fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
return __gup_device_huge(fault_pfn, addr, end, pages, nr);
|
||||
}
|
||||
|
||||
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
|
||||
unsigned long end, int write, struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page;
|
||||
int refs;
|
||||
|
||||
if (!pte_allows_gup(pmd_val(pmd), write))
|
||||
return 0;
|
||||
|
||||
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
|
||||
if (pmd_devmap(pmd))
|
||||
return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
|
||||
|
||||
/* hugepages are never "special" */
|
||||
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
|
||||
|
||||
refs = 0;
|
||||
head = pmd_page(pmd);
|
||||
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
do {
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
get_head_page_multiple(head, refs);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
|
||||
int write, struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long next;
|
||||
pmd_t *pmdp;
|
||||
|
||||
pmdp = pmd_offset(&pud, addr);
|
||||
do {
|
||||
pmd_t pmd = *pmdp;
|
||||
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none(pmd))
|
||||
return 0;
|
||||
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
|
||||
/*
|
||||
* NUMA hinting faults need to be handled in the GUP
|
||||
* slowpath for accounting purposes and so that they
|
||||
* can be serialised against THP migration.
|
||||
*/
|
||||
if (pmd_protnone(pmd))
|
||||
return 0;
|
||||
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
|
||||
return 0;
|
||||
} else {
|
||||
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
|
||||
return 0;
|
||||
}
|
||||
} while (pmdp++, addr = next, addr != end);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
|
||||
unsigned long end, int write, struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page;
|
||||
int refs;
|
||||
|
||||
if (!pte_allows_gup(pud_val(pud), write))
|
||||
return 0;
|
||||
|
||||
VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
|
||||
if (pud_devmap(pud))
|
||||
return __gup_device_huge_pud(pud, addr, end, pages, nr);
|
||||
|
||||
/* hugepages are never "special" */
|
||||
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
|
||||
|
||||
refs = 0;
|
||||
head = pud_page(pud);
|
||||
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
do {
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
get_head_page_multiple(head, refs);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
|
||||
int write, struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long next;
|
||||
pud_t *pudp;
|
||||
|
||||
pudp = pud_offset(&p4d, addr);
|
||||
do {
|
||||
pud_t pud = *pudp;
|
||||
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none(pud))
|
||||
return 0;
|
||||
if (unlikely(pud_large(pud))) {
|
||||
if (!gup_huge_pud(pud, addr, next, write, pages, nr))
|
||||
return 0;
|
||||
} else {
|
||||
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
|
||||
return 0;
|
||||
}
|
||||
} while (pudp++, addr = next, addr != end);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
|
||||
int write, struct page **pages, int *nr)
|
||||
{
|
||||
unsigned long next;
|
||||
p4d_t *p4dp;
|
||||
|
||||
p4dp = p4d_offset(&pgd, addr);
|
||||
do {
|
||||
p4d_t p4d = *p4dp;
|
||||
|
||||
next = p4d_addr_end(addr, end);
|
||||
if (p4d_none(p4d))
|
||||
return 0;
|
||||
BUILD_BUG_ON(p4d_large(p4d));
|
||||
if (!gup_pud_range(p4d, addr, next, write, pages, nr))
|
||||
return 0;
|
||||
} while (p4dp++, addr = next, addr != end);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
* back to the regular GUP.
*/
|
||||
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
|
||||
struct page **pages)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long addr, len, end;
|
||||
unsigned long next;
|
||||
unsigned long flags;
|
||||
pgd_t *pgdp;
|
||||
int nr = 0;
|
||||
|
||||
start &= PAGE_MASK;
|
||||
addr = start;
|
||||
len = (unsigned long) nr_pages << PAGE_SHIFT;
|
||||
end = start + len;
|
||||
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
|
||||
(void __user *)start, len)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* XXX: batch / limit 'nr', to avoid large irq off latency
|
||||
* needs some instrumenting to determine the common sizes used by
|
||||
* important workloads (eg. DB2), and whether limiting the batch size
|
||||
* will decrease performance.
|
||||
*
|
||||
* It seems like we're in the clear for the moment. Direct-IO is
|
||||
* the main guy that batches up lots of get_user_pages, and even
|
||||
* they are limited to 64-at-a-time which is not so many.
|
||||
*/
|
||||
/*
|
||||
* This doesn't prevent pagetable teardown, but does prevent
|
||||
* the pagetables and pages from being freed on x86.
|
||||
*
|
||||
* So long as we atomically load page table pointers versus teardown
|
||||
* (which we do on x86, with the above PAE exception), we can follow the
|
||||
* address down to the page and take a ref on it.
|
||||
*/
|
||||
local_irq_save(flags);
|
||||
pgdp = pgd_offset(mm, addr);
|
||||
do {
|
||||
pgd_t pgd = *pgdp;
|
||||
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none(pgd))
|
||||
break;
|
||||
if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
|
||||
break;
|
||||
} while (pgdp++, addr = next, addr != end);
|
||||
local_irq_restore(flags);
|
||||
|
||||
return nr;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_user_pages_fast() - pin user pages in memory
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @write: whether pages will be written to
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long.
|
||||
*
|
||||
* Attempt to pin user pages in memory without taking mm->mmap_sem.
|
||||
* If not successful, it will fall back to taking the lock and
|
||||
* calling get_user_pages().
|
||||
*
|
||||
* Returns number of pages pinned. This may be fewer than the number
|
||||
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
||||
* were pinned, returns -errno.
|
||||
*/
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
|
||||
struct page **pages)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long addr, len, end;
|
||||
unsigned long next;
|
||||
pgd_t *pgdp;
|
||||
int nr = 0;
|
||||
|
||||
start &= PAGE_MASK;
|
||||
addr = start;
|
||||
len = (unsigned long) nr_pages << PAGE_SHIFT;
|
||||
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
goto slow_irqon;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
if (end >> __VIRTUAL_MASK_SHIFT)
|
||||
goto slow_irqon;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* XXX: batch / limit 'nr', to avoid large irq off latency
|
||||
* needs some instrumenting to determine the common sizes used by
|
||||
* important workloads (eg. DB2), and whether limiting the batch size
|
||||
* will decrease performance.
|
||||
*
|
||||
* It seems like we're in the clear for the moment. Direct-IO is
|
||||
* the main guy that batches up lots of get_user_pages, and even
|
||||
* they are limited to 64-at-a-time which is not so many.
|
||||
*/
|
||||
/*
|
||||
* This doesn't prevent pagetable teardown, but does prevent
|
||||
* the pagetables and pages from being freed on x86.
|
||||
*
|
||||
* So long as we atomically load page table pointers versus teardown
|
||||
* (which we do on x86, with the above PAE exception), we can follow the
|
||||
* address down to the page and take a ref on it.
|
||||
*/
|
||||
local_irq_disable();
|
||||
pgdp = pgd_offset(mm, addr);
|
||||
do {
|
||||
pgd_t pgd = *pgdp;
|
||||
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none(pgd))
|
||||
goto slow;
|
||||
if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
|
||||
goto slow;
|
||||
} while (pgdp++, addr = next, addr != end);
|
||||
local_irq_enable();
|
||||
|
||||
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
|
||||
return nr;
|
||||
|
||||
{
|
||||
int ret;
|
||||
|
||||
slow:
|
||||
local_irq_enable();
|
||||
slow_irqon:
|
||||
/* Try to get the remaining pages with get_user_pages */
|
||||
start += nr << PAGE_SHIFT;
|
||||
pages += nr;
|
||||
|
||||
ret = get_user_pages_unlocked(start,
|
||||
(end - start) >> PAGE_SHIFT,
|
||||
pages, write ? FOLL_WRITE : 0);
|
||||
|
||||
/* Have to be a bit careful with return values */
|
||||
if (nr > 0) {
|
||||
if (ret < 0)
|
||||
ret = nr;
|
||||
else
|
||||
ret += nr;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
}
|
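The kernel-doc comment above spells out the get_user_pages_fast() contract that the generic implementation preserves. As a rough illustration only (not part of this patch; the helper name and four-page buffer size are made up), a caller pinning a small read-only user buffer and releasing it afterwards could look like:

/* Hedged sketch: pin up to four pages of a user buffer for reading,
 * then drop the references with put_page().
 */
static int pin_user_buffer_example(unsigned long uaddr)
{
	struct page *pages[4];
	int i, nr;

	nr = get_user_pages_fast(uaddr, 4, 0 /* write? no */, pages);
	if (nr <= 0)
		return nr ? nr : -EFAULT;

	/* ... use the pinned pages here ... */

	for (i = 0; i < nr; i++)
		put_page(pages[i]);
	return 0;
}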
@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
|
||||
}
|
||||
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
|
||||
#ifdef CONFIG_SMP
|
||||
.active_mm = &init_mm,
|
||||
.loaded_mm = &init_mm,
|
||||
.state = 0,
|
||||
#endif
|
||||
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(cpu_tlbstate);
|
||||
|
@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
|
||||
* When memory was added make sure all the processes MM have
|
||||
* suitable PGD entries in the local PGD level page.
|
||||
*/
|
||||
#ifdef CONFIG_X86_5LEVEL
|
||||
void sync_global_pgds(unsigned long start, unsigned long end)
|
||||
{
|
||||
unsigned long addr;
|
||||
|
||||
for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
|
||||
const pgd_t *pgd_ref = pgd_offset_k(addr);
|
||||
struct page *page;
|
||||
|
||||
/* Check for overflow */
|
||||
if (addr < start)
|
||||
break;
|
||||
|
||||
if (pgd_none(*pgd_ref))
|
||||
continue;
|
||||
|
||||
spin_lock(&pgd_lock);
|
||||
list_for_each_entry(page, &pgd_list, lru) {
|
||||
pgd_t *pgd;
|
||||
spinlock_t *pgt_lock;
|
||||
|
||||
pgd = (pgd_t *)page_address(page) + pgd_index(addr);
|
||||
/* the pgt_lock only for Xen */
|
||||
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
|
||||
spin_lock(pgt_lock);
|
||||
|
||||
if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
|
||||
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
|
||||
|
||||
if (pgd_none(*pgd))
|
||||
set_pgd(pgd, *pgd_ref);
|
||||
|
||||
spin_unlock(pgt_lock);
|
||||
}
|
||||
spin_unlock(&pgd_lock);
|
||||
}
|
||||
}
|
||||
#else
|
||||
void sync_global_pgds(unsigned long start, unsigned long end)
|
||||
{
|
||||
unsigned long addr;
|
||||
@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
|
||||
spin_unlock(&pgd_lock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* NOTE: This function is marked __ref because it calls __init function
|
||||
@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
|
||||
return paddr_last;
|
||||
}
|
||||
|
||||
static unsigned long __meminit
|
||||
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
|
||||
unsigned long page_size_mask)
|
||||
{
|
||||
unsigned long paddr_next, paddr_last = paddr_end;
|
||||
unsigned long vaddr = (unsigned long)__va(paddr);
|
||||
int i = p4d_index(vaddr);
|
||||
|
||||
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
|
||||
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
|
||||
|
||||
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
|
||||
vaddr = (unsigned long)__va(paddr);
|
||||
p4d = p4d_page + p4d_index(vaddr);
|
||||
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
|
||||
|
||||
if (paddr >= paddr_end) {
|
||||
if (!after_bootmem &&
|
||||
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
|
||||
E820_TYPE_RAM) &&
|
||||
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
|
||||
E820_TYPE_RESERVED_KERN))
|
||||
set_p4d(p4d, __p4d(0));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!p4d_none(*p4d)) {
|
||||
pud = pud_offset(p4d, 0);
|
||||
paddr_last = phys_pud_init(pud, paddr,
|
||||
paddr_end,
|
||||
page_size_mask);
|
||||
__flush_tlb_all();
|
||||
continue;
|
||||
}
|
||||
|
||||
pud = alloc_low_page();
|
||||
paddr_last = phys_pud_init(pud, paddr, paddr_end,
|
||||
page_size_mask);
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
p4d_populate(&init_mm, p4d, pud);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
}
|
||||
__flush_tlb_all();
|
||||
|
||||
return paddr_last;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create page table mapping for the physical memory for specific physical
|
||||
* addresses. The virtual and physical addresses have to be aligned on PMD level
|
||||
@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
|
||||
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
|
||||
pgd_t *pgd = pgd_offset_k(vaddr);
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
|
||||
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
|
||||
|
||||
BUILD_BUG_ON(pgd_none(*pgd));
|
||||
p4d = p4d_offset(pgd, vaddr);
|
||||
if (p4d_val(*p4d)) {
|
||||
pud = (pud_t *)p4d_page_vaddr(*p4d);
|
||||
paddr_last = phys_pud_init(pud, __pa(vaddr),
|
||||
if (pgd_val(*pgd)) {
|
||||
p4d = (p4d_t *)pgd_page_vaddr(*pgd);
|
||||
paddr_last = phys_p4d_init(p4d, __pa(vaddr),
|
||||
__pa(vaddr_end),
|
||||
page_size_mask);
|
||||
continue;
|
||||
}
|
||||
|
||||
pud = alloc_low_page();
|
||||
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
|
||||
p4d = alloc_low_page();
|
||||
paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
|
||||
page_size_mask);
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
p4d_populate(&init_mm, p4d, pud);
|
||||
if (IS_ENABLED(CONFIG_X86_5LEVEL))
|
||||
pgd_populate(&init_mm, pgd, p4d);
|
||||
else
|
||||
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
pgd_changed = true;
|
||||
}
|
||||
|
@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
|
||||
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
|
||||
{
|
||||
/* Don't assume we're using swapper_pg_dir at this point */
|
||||
pgd_t *base = __va(read_cr3());
|
||||
pgd_t *base = __va(read_cr3_pa());
|
||||
pgd_t *pgd = &base[pgd_index(addr)];
|
||||
p4d_t *p4d = p4d_offset(pgd, addr);
|
||||
pud_t *pud = pud_offset(p4d, addr);
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
|
||||
extern pgd_t early_top_pgt[PTRS_PER_PGD];
|
||||
extern struct range pfn_mapped[E820_MAX_ENTRIES];
|
||||
|
||||
static int __init map_range(struct range *range)
|
||||
@ -109,8 +109,8 @@ void __init kasan_early_init(void)
|
||||
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
|
||||
kasan_zero_p4d[i] = __p4d(p4d_val);
|
||||
|
||||
kasan_map_early_shadow(early_level4_pgt);
|
||||
kasan_map_early_shadow(init_level4_pgt);
|
||||
kasan_map_early_shadow(early_top_pgt);
|
||||
kasan_map_early_shadow(init_top_pgt);
|
||||
}
|
||||
|
||||
void __init kasan_init(void)
|
||||
@ -121,8 +121,8 @@ void __init kasan_init(void)
|
||||
register_die_notifier(&kasan_die_notifier);
|
||||
#endif
|
||||
|
||||
memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
|
||||
load_cr3(early_level4_pgt);
|
||||
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
|
||||
load_cr3(early_top_pgt);
|
||||
__flush_tlb_all();
|
||||
|
||||
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
|
||||
@ -148,7 +148,7 @@ void __init kasan_init(void)
|
||||
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
|
||||
(void *)KASAN_SHADOW_END);
|
||||
|
||||
load_cr3(init_level4_pgt);
|
||||
load_cr3(init_top_pgt);
|
||||
__flush_tlb_all();
|
||||
|
||||
/*
|
||||
|
@ -6,12 +6,12 @@
|
||||
*
|
||||
* Entropy is generated using the KASLR early boot functions now shared in
|
||||
* the lib directory (originally written by Kees Cook). Randomization is
|
||||
* done on PGD & PUD page table levels to increase possible addresses. The
|
||||
* physical memory mapping code was adapted to support PUD level virtual
|
||||
* addresses. This implementation on the best configuration provides 30,000
|
||||
* possible virtual addresses in average for each memory region. An additional
|
||||
* low memory page is used to ensure each CPU can start with a PGD aligned
|
||||
* virtual address (for realmode).
|
||||
* done on PGD & P4D/PUD page table levels to increase possible addresses.
|
||||
* The physical memory mapping code was adapted to support P4D/PUD level
|
||||
* virtual addresses. This implementation on the best configuration provides
|
||||
* 30,000 possible virtual addresses in average for each memory region.
|
||||
* An additional low memory page is used to ensure each CPU can start with
|
||||
* a PGD aligned virtual address (for realmode).
|
||||
*
|
||||
* The order of each memory region is not changed. The feature looks at
|
||||
* the available space for the regions based on different configuration
|
||||
@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
|
||||
unsigned long *base;
|
||||
unsigned long size_tb;
|
||||
} kaslr_regions[] = {
|
||||
{ &page_offset_base, 64/* Maximum */ },
|
||||
{ &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
|
||||
{ &vmalloc_base, VMALLOC_SIZE_TB },
|
||||
{ &vmemmap_base, 1 },
|
||||
};
|
||||
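For scale (an aside, not part of the patch): assuming TB_SHIFT is 40 and __PHYSICAL_MASK_SHIFT is 46 without 5-level paging and 52 with it, the new expression for the first region works out as follows.

/* Rough arithmetic under the assumptions named above:
 *   4-level: 1 << (46 - 40) =   64 TB  (the old hard-coded value)
 *   5-level: 1 << (52 - 40) = 4096 TB
 */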
@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
|
||||
*/
|
||||
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
|
||||
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
|
||||
entropy = (rand % (entropy + 1)) & PUD_MASK;
|
||||
if (IS_ENABLED(CONFIG_X86_5LEVEL))
|
||||
entropy = (rand % (entropy + 1)) & P4D_MASK;
|
||||
else
|
||||
entropy = (rand % (entropy + 1)) & PUD_MASK;
|
||||
vaddr += entropy;
|
||||
*kaslr_regions[i].base = vaddr;
|
||||
|
||||
@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
|
||||
* randomization alignment.
|
||||
*/
|
||||
vaddr += get_padding(&kaslr_regions[i]);
|
||||
vaddr = round_up(vaddr + 1, PUD_SIZE);
|
||||
if (IS_ENABLED(CONFIG_X86_5LEVEL))
|
||||
vaddr = round_up(vaddr + 1, P4D_SIZE);
|
||||
else
|
||||
vaddr = round_up(vaddr + 1, PUD_SIZE);
|
||||
remain_entropy -= entropy;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Create PGD aligned trampoline table to allow real mode initialization
|
||||
* of additional CPUs. Consume only 1 low memory page.
|
||||
*/
|
||||
void __meminit init_trampoline(void)
|
||||
static void __meminit init_trampoline_pud(void)
|
||||
{
|
||||
unsigned long paddr, paddr_next;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud_page, *pud_page_tramp;
|
||||
int i;
|
||||
|
||||
if (!kaslr_memory_enabled()) {
|
||||
init_trampoline_default();
|
||||
return;
|
||||
}
|
||||
|
||||
pud_page_tramp = alloc_low_page();
|
||||
|
||||
paddr = 0;
|
||||
@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
|
||||
}
|
||||
|
||||
static void __meminit init_trampoline_p4d(void)
|
||||
{
|
||||
unsigned long paddr, paddr_next;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d_page, *p4d_page_tramp;
|
||||
int i;
|
||||
|
||||
p4d_page_tramp = alloc_low_page();
|
||||
|
||||
paddr = 0;
|
||||
pgd = pgd_offset_k((unsigned long)__va(paddr));
|
||||
p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
|
||||
|
||||
for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
|
||||
p4d_t *p4d, *p4d_tramp;
|
||||
unsigned long vaddr = (unsigned long)__va(paddr);
|
||||
|
||||
p4d_tramp = p4d_page_tramp + p4d_index(paddr);
|
||||
p4d = p4d_page + p4d_index(vaddr);
|
||||
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
|
||||
|
||||
*p4d_tramp = *p4d;
|
||||
}
|
||||
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Create PGD aligned trampoline table to allow real mode initialization
|
||||
* of additional CPUs. Consume only 1 low memory page.
|
||||
*/
|
||||
void __meminit init_trampoline(void)
|
||||
{
|
||||
|
||||
if (!kaslr_memory_enabled()) {
|
||||
init_trampoline_default();
|
||||
return;
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_5LEVEL))
|
||||
init_trampoline_p4d();
|
||||
else
|
||||
init_trampoline_pud();
|
||||
}
|
||||
|
@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
|
||||
if (current->personality & ADDR_COMPAT_LAYOUT)
|
||||
return 1;
|
||||
|
||||
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
|
||||
return 1;
|
||||
|
||||
return sysctl_legacy_va_layout;
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,7 @@
|
||||
#include <linux/debugfs.h>
|
||||
|
||||
/*
|
||||
* Smarter SMP flushing macros.
|
||||
* TLB flushing, formerly SMP-only
|
||||
* c/o Linus Torvalds.
|
||||
*
|
||||
* These mean you can really definitely utterly forget about
|
||||
@ -28,39 +28,28 @@
|
||||
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
struct flush_tlb_info {
|
||||
struct mm_struct *flush_mm;
|
||||
unsigned long flush_start;
|
||||
unsigned long flush_end;
|
||||
};
|
||||
|
||||
/*
|
||||
* We cannot call mmdrop() because we are in interrupt context,
|
||||
* instead update mm->cpu_vm_mask.
|
||||
*/
|
||||
void leave_mm(int cpu)
|
||||
{
|
||||
struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
|
||||
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
|
||||
/*
|
||||
* It's plausible that we're in lazy TLB mode while our mm is init_mm.
|
||||
* If so, our callers still expect us to flush the TLB, but there
|
||||
* aren't any user TLB entries in init_mm to worry about.
|
||||
*
|
||||
* This needs to happen before any other sanity checks due to
|
||||
* intel_idle's shenanigans.
|
||||
*/
|
||||
if (loaded_mm == &init_mm)
|
||||
return;
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
|
||||
BUG();
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
|
||||
load_cr3(swapper_pg_dir);
|
||||
/*
|
||||
* This gets called in the idle path where RCU
|
||||
* functions differently. Tracing normally
|
||||
* uses RCU, so we have to call the tracepoint
|
||||
* specially here.
|
||||
*/
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
}
|
||||
|
||||
switch_mm(NULL, &init_mm, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(leave_mm);
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
unsigned cpu = smp_processor_id();
|
||||
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
|
||||
if (likely(prev != next)) {
|
||||
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
|
||||
/*
|
||||
* If our current stack is in vmalloc space and isn't
|
||||
* mapped in the new pgd, we'll double-fault. Forcibly
|
||||
* map it.
|
||||
*/
|
||||
unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
|
||||
/*
|
||||
* NB: The scheduler will call us with prev == next when
|
||||
* switching from lazy TLB mode to normal mode if active_mm
|
||||
* isn't changing. When this happens, there is no guarantee
|
||||
* that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
|
||||
*
|
||||
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
|
||||
*/
|
||||
|
||||
pgd_t *pgd = next->pgd + stack_pgd_index;
|
||||
|
||||
if (unlikely(pgd_none(*pgd)))
|
||||
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
this_cpu_write(cpu_tlbstate.active_mm, next);
|
||||
#endif
|
||||
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
|
||||
if (real_prev == next) {
|
||||
/*
|
||||
* Re-load page tables.
|
||||
*
|
||||
* This logic has an ordering constraint:
|
||||
*
|
||||
* CPU 0: Write to a PTE for 'next'
|
||||
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
||||
* CPU 1: set bit 1 in next's mm_cpumask
|
||||
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
||||
*
|
||||
* We need to prevent an outcome in which CPU 1 observes
|
||||
* the new PTE value and CPU 0 observes bit 1 clear in
|
||||
* mm_cpumask. (If that occurs, then the IPI will never
|
||||
* be sent, and CPU 0's TLB will contain a stale entry.)
|
||||
*
|
||||
* The bad outcome can occur if either CPU's load is
|
||||
* reordered before that CPU's store, so both CPUs must
|
||||
* execute full barriers to prevent this from happening.
|
||||
*
|
||||
* Thus, switch_mm needs a full barrier between the
|
||||
* store to mm_cpumask and any operation that could load
|
||||
* from next->pgd. TLB fills are special and can happen
|
||||
* due to instruction fetches or for no reason at all,
|
||||
* and neither LOCK nor MFENCE orders them.
|
||||
* Fortunately, load_cr3() is serializing and gives the
|
||||
* ordering guarantee we need.
|
||||
*
|
||||
* There's nothing to do: we always keep the per-mm control
|
||||
* regs in sync with cpu_tlbstate.loaded_mm. Just
|
||||
* sanity-check mm_cpumask.
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
|
||||
/* Stop flush ipis for the previous mm */
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(prev));
|
||||
|
||||
/* Load per-mm CR4 state */
|
||||
load_mm_cr4(next);
|
||||
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
/*
|
||||
* Load the LDT, if the LDT is different.
|
||||
*
|
||||
* It's possible that prev->context.ldt doesn't match
|
||||
* the LDT register. This can happen if leave_mm(prev)
|
||||
* was called and then modify_ldt changed
|
||||
* prev->context.ldt but suppressed an IPI to this CPU.
|
||||
* In this case, prev->context.ldt != NULL, because we
|
||||
* never set context.ldt to NULL while the mm still
|
||||
* exists. That means that next->context.ldt !=
|
||||
* prev->context.ldt, because mms never share an LDT.
|
||||
*/
|
||||
if (unlikely(prev->context.ldt != next->context.ldt))
|
||||
load_mm_ldt(next);
|
||||
#endif
|
||||
}
|
||||
#ifdef CONFIG_SMP
|
||||
else {
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
|
||||
|
||||
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
||||
/*
|
||||
* On established mms, the mm_cpumask is only changed
|
||||
* from irq context, from ptep_clear_flush() while in
|
||||
* lazy tlb mode, and here. Irqs are blocked during
|
||||
* schedule, protecting us from simultaneous changes.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* We were in lazy tlb mode and leave_mm disabled
|
||||
* tlb flush IPI delivery. We must reload CR3
|
||||
* to make sure to use no freed page tables.
|
||||
*
|
||||
* As above, load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask write.
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
load_mm_cr4(next);
|
||||
load_mm_ldt(next);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
|
||||
/*
|
||||
* If our current stack is in vmalloc space and isn't
|
||||
* mapped in the new pgd, we'll double-fault. Forcibly
|
||||
* map it.
|
||||
*/
|
||||
unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
|
||||
|
||||
pgd_t *pgd = next->pgd + stack_pgd_index;
|
||||
|
||||
if (unlikely(pgd_none(*pgd)))
|
||||
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
|
||||
}
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
|
||||
WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* Re-load page tables.
|
||||
*
|
||||
* This logic has an ordering constraint:
|
||||
*
|
||||
* CPU 0: Write to a PTE for 'next'
|
||||
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
||||
* CPU 1: set bit 1 in next's mm_cpumask
|
||||
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
||||
*
|
||||
* We need to prevent an outcome in which CPU 1 observes
|
||||
* the new PTE value and CPU 0 observes bit 1 clear in
|
||||
* mm_cpumask. (If that occurs, then the IPI will never
|
||||
* be sent, and CPU 0's TLB will contain a stale entry.)
|
||||
*
|
||||
* The bad outcome can occur if either CPU's load is
|
||||
* reordered before that CPU's store, so both CPUs must
|
||||
* execute full barriers to prevent this from happening.
|
||||
*
|
||||
* Thus, switch_mm needs a full barrier between the
|
||||
* store to mm_cpumask and any operation that could load
|
||||
* from next->pgd. TLB fills are special and can happen
|
||||
* due to instruction fetches or for no reason at all,
|
||||
* and neither LOCK nor MFENCE orders them.
|
||||
* Fortunately, load_cr3() is serializing and gives the
|
||||
* ordering guarantee we need.
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
|
||||
/*
|
||||
* This gets called via leave_mm() in the idle path where RCU
|
||||
* functions differently. Tracing normally uses RCU, so we have to
|
||||
* call the tracepoint specially here.
|
||||
*/
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
|
||||
/* Stop flush ipis for the previous mm */
|
||||
WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
|
||||
real_prev != &init_mm);
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
||||
|
||||
/* Load per-mm CR4 and LDTR state */
|
||||
load_mm_cr4(next);
|
||||
switch_ldt(real_prev, next);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/*
|
||||
* The flush IPI assumes that a thread switch happens in this order:
|
||||
* [cpu0: the cpu that switches]
|
||||
* 1) switch_mm() either 1a) or 1b)
|
||||
* 1a) thread switch to a different mm
|
||||
* 1a1) set cpu_tlbstate to TLBSTATE_OK
|
||||
* Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
|
||||
* if cpu0 was in lazy tlb mode.
|
||||
* 1a2) update cpu active_mm
|
||||
* Now cpu0 accepts tlb flushes for the new mm.
|
||||
* 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
|
||||
* Now the other cpus will send tlb flush ipis.
|
||||
* 1a4) change cr3.
|
||||
* 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
|
||||
* Stop ipi delivery for the old mm. This is not synchronized with
|
||||
* the other cpus, but flush_tlb_func ignore flush ipis for the wrong
|
||||
* mm, and in the worst case we perform a superfluous tlb flush.
|
||||
* 1b) thread switch without mm change
|
||||
* cpu active_mm is correct, cpu0 already handles flush ipis.
|
||||
* 1b1) set cpu_tlbstate to TLBSTATE_OK
|
||||
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
|
||||
* Atomically set the bit [other cpus will start sending flush ipis],
|
||||
* and test the bit.
|
||||
* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
|
||||
* 2) switch %%esp, ie current
|
||||
*
|
||||
* The interrupt must handle 2 special cases:
|
||||
* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
|
||||
* - the cpu performs speculative tlb reads, i.e. even if the cpu only
|
||||
* runs in kernel space, the cpu could load tlb entries for user space
|
||||
* pages.
|
||||
*
|
||||
* The good news is that cpu_tlbstate is local to each cpu, no
|
||||
* write/read ordering problems.
|
||||
*/
|
||||
|
||||
/*
* TLB flush function:
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
*/
|
||||
static void flush_tlb_func(void *info)
|
||||
static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
||||
bool local, enum tlb_flush_reason reason)
|
||||
{
|
||||
struct flush_tlb_info *f = info;
|
||||
/* This code cannot presently handle being reentered. */
|
||||
VM_WARN_ON(!irqs_disabled());
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
|
||||
leave_mm(smp_processor_id());
|
||||
return;
|
||||
}
|
||||
|
||||
if (f->end == TLB_FLUSH_ALL) {
|
||||
local_flush_tlb();
|
||||
if (local)
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
trace_tlb_flush(reason, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
unsigned long addr;
|
||||
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
|
||||
addr = f->start;
|
||||
while (addr < f->end) {
|
||||
__flush_tlb_single(addr);
|
||||
addr += PAGE_SIZE;
|
||||
}
|
||||
if (local)
|
||||
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
|
||||
trace_tlb_flush(reason, nr_pages);
|
||||
}
|
||||
}
|
||||
|
||||
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
|
||||
{
|
||||
const struct flush_tlb_info *f = info;
|
||||
|
||||
flush_tlb_func_common(f, true, reason);
|
||||
}
|
||||
|
||||
static void flush_tlb_func_remote(void *info)
|
||||
{
|
||||
const struct flush_tlb_info *f = info;
|
||||
|
||||
inc_irq_stat(irq_tlb_count);
|
||||
|
||||
if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
|
||||
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
|
||||
return;
|
||||
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
||||
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
|
||||
if (f->flush_end == TLB_FLUSH_ALL) {
|
||||
local_flush_tlb();
|
||||
trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
unsigned long addr;
|
||||
unsigned long nr_pages =
|
||||
(f->flush_end - f->flush_start) / PAGE_SIZE;
|
||||
addr = f->flush_start;
|
||||
while (addr < f->flush_end) {
|
||||
__flush_tlb_single(addr);
|
||||
addr += PAGE_SIZE;
|
||||
}
|
||||
trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
|
||||
}
|
||||
} else
|
||||
leave_mm(smp_processor_id());
|
||||
|
||||
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
|
||||
}
|
||||
|
||||
void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end)
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
struct flush_tlb_info info;
|
||||
|
||||
info.flush_mm = mm;
|
||||
info.flush_start = start;
|
||||
info.flush_end = end;
|
||||
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
||||
if (end == TLB_FLUSH_ALL)
|
||||
if (info->end == TLB_FLUSH_ALL)
|
||||
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
|
||||
else
|
||||
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
|
||||
(end - start) >> PAGE_SHIFT);
|
||||
(info->end - info->start) >> PAGE_SHIFT);
|
||||
|
||||
if (is_uv_system()) {
|
||||
unsigned int cpu;
|
||||
|
||||
cpu = smp_processor_id();
|
||||
cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
|
||||
cpumask = uv_flush_tlb_others(cpumask, info);
|
||||
if (cpumask)
|
||||
smp_call_function_many(cpumask, flush_tlb_func,
|
||||
&info, 1);
|
||||
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
||||
(void *)info, 1);
|
||||
return;
|
||||
}
|
||||
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
|
||||
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
||||
(void *)info, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -302,84 +242,40 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
|
||||
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned long vmflag)
|
||||
{
|
||||
unsigned long addr;
|
||||
/* do a global flush by default */
|
||||
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
|
||||
int cpu;
|
||||
|
||||
preempt_disable();
|
||||
struct flush_tlb_info info = {
|
||||
.mm = mm,
|
||||
};
|
||||
|
||||
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
|
||||
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
|
||||
if (base_pages_to_flush > tlb_single_page_flush_ceiling)
|
||||
base_pages_to_flush = TLB_FLUSH_ALL;
|
||||
cpu = get_cpu();
|
||||
|
||||
if (current->active_mm != mm) {
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!current->mm) {
|
||||
leave_mm(smp_processor_id());
|
||||
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Both branches below are implicit full barriers (MOV to CR or
|
||||
* INVLPG) that synchronize with switch_mm.
|
||||
*/
|
||||
if (base_pages_to_flush == TLB_FLUSH_ALL) {
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
local_flush_tlb();
|
||||
/* Should we flush just the requested range? */
|
||||
if ((end != TLB_FLUSH_ALL) &&
|
||||
!(vmflag & VM_HUGETLB) &&
|
||||
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
|
||||
info.start = start;
|
||||
info.end = end;
|
||||
} else {
|
||||
/* flush range by one by one 'invlpg' */
|
||||
for (addr = start; addr < end; addr += PAGE_SIZE) {
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
|
||||
__flush_tlb_single(addr);
|
||||
}
|
||||
info.start = 0UL;
|
||||
info.end = TLB_FLUSH_ALL;
|
||||
}
|
||||
trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
|
||||
out:
|
||||
if (base_pages_to_flush == TLB_FLUSH_ALL) {
|
||||
start = 0UL;
|
||||
end = TLB_FLUSH_ALL;
|
||||
|
||||
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
||||
VM_WARN_ON(irqs_disabled());
|
||||
local_irq_disable();
|
||||
flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
|
||||
local_irq_enable();
|
||||
}
|
||||
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), mm, start, end);
|
||||
preempt_enable();
|
||||
|
||||
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), &info);
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
if (current->active_mm == mm) {
|
||||
if (current->mm) {
|
||||
/*
|
||||
* Implicit full barrier (INVLPG) that synchronizes
|
||||
* with switch_mm.
|
||||
*/
|
||||
__flush_tlb_one(start);
|
||||
} else {
|
||||
leave_mm(smp_processor_id());
|
||||
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
}
|
||||
}
|
||||
|
||||
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void do_flush_tlb_all(void *info)
|
||||
{
|
||||
@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
|
||||
unsigned long addr;
|
||||
|
||||
/* flush range by one by one 'invlpg' */
|
||||
for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
|
||||
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
|
||||
__flush_tlb_single(addr);
|
||||
}
|
||||
|
||||
@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
|
||||
/* Balance as user space task's flush, a bit conservative */
|
||||
if (end == TLB_FLUSH_ALL ||
|
||||
(end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
|
||||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
||||
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
} else {
|
||||
struct flush_tlb_info info;
|
||||
info.flush_start = start;
|
||||
info.flush_end = end;
|
||||
info.start = start;
|
||||
info.end = end;
|
||||
on_each_cpu(do_kernel_range_flush, &info, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
||||
{
|
||||
struct flush_tlb_info info = {
|
||||
.mm = NULL,
|
||||
.start = 0UL,
|
||||
.end = TLB_FLUSH_ALL,
|
||||
};
|
||||
|
||||
int cpu = get_cpu();
|
||||
|
||||
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
||||
VM_WARN_ON(irqs_disabled());
|
||||
local_irq_disable();
|
||||
flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
|
||||
flush_tlb_others(&batch->cpumask, &info);
|
||||
cpumask_clear(&batch->cpumask);
|
||||
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
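flush_tlb_others() now takes a single const struct flush_tlb_info * rather than separate mm/start/end arguments, as the callers above show. A minimal sketch of the new calling convention (illustrative only; the helper name and one-page range are invented):

/* Hedged sketch: flush one page of 'mm' on every other CPU that has it loaded. */
static void example_flush_one_page(struct mm_struct *mm, unsigned long addr)
{
	struct flush_tlb_info info = {
		.mm    = mm,
		.start = addr,
		.end   = addr + PAGE_SIZE,
	};

	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);
}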
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
|
||||
return 0;
|
||||
}
|
||||
late_initcall(create_tlb_single_page_flush_ceiling);
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
|
||||
int n_pgds, i, j;
|
||||
|
||||
if (!efi_enabled(EFI_OLD_MEMMAP)) {
|
||||
save_pgd = (pgd_t *)read_cr3();
|
||||
save_pgd = (pgd_t *)__read_cr3();
|
||||
write_cr3((unsigned long)efi_scratch.efi_pgt);
|
||||
goto out;
|
||||
}
|
||||
@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
|
||||
efi_sync_low_kernel_mappings();
|
||||
local_irq_save(flags);
|
||||
|
||||
efi_scratch.prev_cr3 = read_cr3();
|
||||
efi_scratch.prev_cr3 = __read_cr3();
|
||||
write_cr3((unsigned long)efi_scratch.efi_pgt);
|
||||
__flush_tlb_all();
|
||||
|
||||
|
@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
|
||||
|
||||
asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
|
||||
{
|
||||
void *pgd_addr = __va(read_cr3());
|
||||
void *pgd_addr = __va(read_cr3_pa());
|
||||
|
||||
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
|
||||
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
|
||||
|
@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
|
||||
* done. The returned pointer is valid till preemption is re-enabled.
|
||||
*/
|
||||
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
unsigned int cpu)
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
unsigned int cpu = smp_processor_id();
|
||||
int locals = 0, remotes = 0, hubs = 0;
|
||||
struct bau_desc *bau_desc;
|
||||
struct cpumask *flush_mask;
|
||||
@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
|
||||
|
||||
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
|
||||
|
||||
if (!end || (end - start) <= PAGE_SIZE)
|
||||
address = start;
|
||||
if (!info->end || (info->end - info->start) <= PAGE_SIZE)
|
||||
address = info->start;
|
||||
else
|
||||
address = TLB_FLUSH_ALL;
|
||||
|
||||
|
@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
|
||||
*/
|
||||
ctxt->cr0 = read_cr0();
|
||||
ctxt->cr2 = read_cr2();
|
||||
ctxt->cr3 = read_cr3();
|
||||
ctxt->cr3 = __read_cr3();
|
||||
ctxt->cr4 = __read_cr4();
|
||||
#ifdef CONFIG_X86_64
|
||||
ctxt->cr8 = read_cr8();
|
||||
|
@ -150,7 +150,8 @@ static int relocate_restore_code(void)
|
||||
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
|
||||
|
||||
/* Make the page containing the relocated code executable */
|
||||
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
|
||||
pgd = (pgd_t *)__va(read_cr3_pa()) +
|
||||
pgd_index(relocated_restore_code);
|
||||
p4d = p4d_offset(pgd, relocated_restore_code);
|
||||
if (p4d_large(*p4d)) {
|
||||
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
|
||||
|
@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
|
||||
|
||||
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
|
||||
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
|
||||
trampoline_pgd[511] = init_level4_pgt[511].pgd;
|
||||
trampoline_pgd[511] = init_top_pgt[511].pgd;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
        spin_unlock(&mm->page_table_lock);
}

#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
static void drop_mm_ref_this_cpu(void *info)
{
        struct mm_struct *mm = info;
        struct mm_struct *active_mm;

        active_mm = this_cpu_read(cpu_tlbstate.active_mm);

        if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
        if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
                leave_mm(smp_processor_id());

        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
        /*
         * If this cpu still has a stale cr3 reference, then make sure
         * it has been flushed.
         */
        if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
                load_cr3(swapper_pg_dir);
        xen_mc_flush();
}

#ifdef CONFIG_SMP
/*
 * Another cpu may still have their %cr3 pointing at the pagetable, so
 * we need to repoint it somewhere else before we can unpin it.
 */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
        cpumask_var_t mask;
        unsigned cpu;

        if (current->active_mm == mm) {
                if (current->mm == mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
        }
        drop_mm_ref_this_cpu(mm);

        /* Get the "official" set of cpus referring to our pagetable. */
        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
                        if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
                            && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
                                continue;
                        smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
                        smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
                }
                return;
        }
        cpumask_copy(mask, mm_cpumask(mm));

        /* It's possible that a vcpu may have a stale reference to our
           cr3, because its in lazy mode, and it hasn't yet flushed
           its set of pending hypercalls yet. In this case, we can
           look at its actual current cr3 value, and force it to flush
           if needed. */
        /*
         * It's possible that a vcpu may have a stale reference to our
         * cr3, because its in lazy mode, and it hasn't yet flushed
         * its set of pending hypercalls yet. In this case, we can
         * look at its actual current cr3 value, and force it to flush
         * if needed.
         */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpumask_set_cpu(cpu, mask);
        }

        if (!cpumask_empty(mask))
                smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
                smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
        free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
        drop_mm_ref_this_cpu(mm);
}
#endif
@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
}

static void xen_flush_tlb_others(const struct cpumask *cpus,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long end)
                                 const struct flush_tlb_info *info)
{
        struct {
                struct mmuext_op op;

@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
        } *args;
        struct multicall_space mcs;

        trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
        trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);

        if (cpumask_empty(cpus))
                return;         /* nothing to do */

@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
        cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

        args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
        if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
        if (info->end != TLB_FLUSH_ALL &&
            (info->end - info->start) <= PAGE_SIZE) {
                args->op.cmd = MMUEXT_INVLPG_MULTI;
                args->op.arg1.linear_addr = start;
                args->op.arg1.linear_addr = info->start;
        }

        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
 * At the start of the day - when Xen launches a guest, it has already
 * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft as appropriate them in the
 * init_level4_pgt and its friends. Then when we are happy we load
 * the new init_level4_pgt - and continue on.
 * init_top_pgt and its friends. Then when we are happy we load
 * the new init_top_pgt - and continue on.
 *
 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
 * up the rest of the pagetables. When it has completed it loads the cr3.

@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
        pt_end = pt_base + xen_start_info->nr_pt_frames;

        /* Zap identity mapping */
        init_level4_pgt[0] = __pgd(0);
        init_top_pgt[0] = __pgd(0);

        /* Pre-constructed entries are in pfn, so convert to mfn */
        /* L4[272] -> level3_ident_pgt */
        /* L4[511] -> level3_kernel_pgt */
        convert_pfn_mfn(init_level4_pgt);
        convert_pfn_mfn(init_top_pgt);

        /* L3_i[0] -> level2_ident_pgt */
        convert_pfn_mfn(level3_ident_pgt);

@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
        /* Copy the initial P->M table mappings if necessary. */
        i = pgd_index(xen_start_info->mfn_list);
        if (i && i < pgd_index(__START_KERNEL_map))
                init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
                init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];

        /* Make pagetable pieces RO */
        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
        set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);

@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)

        /* Pin down new L4 */
        pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
                          PFN_DOWN(__pa_symbol(init_level4_pgt)));
                          PFN_DOWN(__pa_symbol(init_top_pgt)));

        /* Unpin Xen-provided one */
        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
         * attach it to, so make sure we just set kernel pgd.
         */
        xen_mc_batch();
        __xen_write_cr3(true, __pa(init_level4_pgt));
        __xen_write_cr3(true, __pa(init_top_pgt));
        xen_mc_issue(PARAVIRT_LAZY_CPU);

        /* We can't that easily rip out L3 and L2, as the Xen pagetables are
@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
        pmd_t pmd;
        pte_t pte;

        pa = read_cr3();
        pa = read_cr3_pa();
        pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
                                                  sizeof(pgd)));
        if (!pgd_present(pgd))

@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
        pt_phys = pmd_phys + PFN_PHYS(n_pmd);
        p2m_pfn = PFN_DOWN(pt_phys) + n_pt;

        pgd = __va(read_cr3());
        pgd = __va(read_cr3_pa());
        new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
        idx_p4d = 0;
        save_pud = n_pud;

@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{
        unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));

        BUG_ON(read_cr3() != __pa(initial_page_table));
        BUG_ON(read_cr3_pa() != __pa(initial_page_table));
        BUG_ON(cr3 != __pa(swapper_pg_dir));

        /*
@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
        wrmsr

        /* Enable pre-constructed page tables. */
        mov $_pa(init_level4_pgt), %eax
        mov $_pa(init_top_pgt), %eax
        mov %eax, %cr3
        mov $(X86_CR0_PG | X86_CR0_PE), %eax
        mov %eax, %cr0
@ -14,6 +14,10 @@

#include <asm/page.h>

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
#include <asm/tlbbatch.h>
#endif

#define USE_SPLIT_PTE_PTLOCKS   (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS   (USE_SPLIT_PTE_PTLOCKS && \
                IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
@ -67,12 +71,15 @@ struct page_frag {
struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
        /*
         * Each bit set is a CPU that potentially has a TLB entry for one of
         * the PFNs being flushed. See set_tlb_ubc_flush_pending().
         * The arch code makes the following promise: generic code can modify a
         * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
         * needed barriers), then call arch_tlbbatch_flush(), and the entries
         * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
         * returns.
         */
        struct cpumask cpumask;
        struct arch_tlbflush_unmap_batch arch;

        /* True if any bit in cpumask is set */
        /* True if a flush is needed. */
        bool flush_required;

        /*
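The new comment above spells out the contract between generic code and the arch_tlbbatch_add_mm()/arch_tlbbatch_flush() pair. Below is a toy, userspace-only model of that contract; every name ending in _sketch is a stand-in for illustration and not the kernel API.

/* Toy model of the batching contract described above: callers record which
 * mms had PTEs changed, then issue one flush that covers everything recorded. */
#include <stdbool.h>
#include <stdio.h>

struct arch_tlbflush_unmap_batch_sketch {
        unsigned long cpumask_bits;     /* pretend cpumask */
};

struct tlbflush_unmap_batch_sketch {
        struct arch_tlbflush_unmap_batch_sketch arch;
        bool flush_required;
};

static void arch_tlbbatch_add_mm_sketch(struct arch_tlbflush_unmap_batch_sketch *batch,
                                        unsigned long mm_cpumask_bits)
{
        /* Accumulate CPUs that may still cache TLB entries for this mm. */
        batch->cpumask_bits |= mm_cpumask_bits;
}

static void arch_tlbbatch_flush_sketch(struct arch_tlbflush_unmap_batch_sketch *batch)
{
        /* One flush covers every CPU accumulated so far. */
        printf("flushing TLBs on cpumask %#lx\n", batch->cpumask_bits);
        batch->cpumask_bits = 0;
}

int main(void)
{
        struct tlbflush_unmap_batch_sketch ubc = {
                .arch = { .cpumask_bits = 0 },
                .flush_required = false,
        };

        /* "Unmap" pages belonging to two mms, then flush once. */
        arch_tlbbatch_add_mm_sketch(&ubc.arch, 0x3);   /* mm A ran on CPUs 0,1 */
        arch_tlbbatch_add_mm_sketch(&ubc.arch, 0xc);   /* mm B ran on CPUs 2,3 */
        ubc.flush_required = true;

        if (ubc.flush_required) {
                arch_tlbbatch_flush_sketch(&ubc.arch);
                ubc.flush_required = false;
        }
        return 0;
}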
@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#endif
#endif
#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
                NR_TLB_REMOTE_FLUSH,    /* cpu tried to flush others' tlbs */
                NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
#endif /* CONFIG_SMP */
                NR_TLB_LOCAL_FLUSH_ALL,
                NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
        bool

config HAVE_GENERIC_RCU_GUP
config HAVE_GENERIC_GUP
        bool

config ARCH_DISCARD_MEMBLOCK
mm/gup.c
@ -1146,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */

/*
 * Generic RCU Fast GUP
 * Generic Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be

@ -1167,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr)
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *      pages containing page tables.
 *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *      free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *

@ -1178,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr)
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
#ifdef CONFIG_HAVE_GENERIC_GUP

#ifndef gup_get_pte
/*

@ -1668,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
        return ret;
}

#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
#endif /* CONFIG_HAVE_GENERIC_GUP */
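For context, a hedged sketch of how a kernel-side caller typically uses the get_user_pages_fast() signature visible in the last hunk header (start, nr_pages, write, pages). The helper name and error handling below are illustrative assumptions, not code from this commit.

/* Illustrative caller of get_user_pages_fast(); pin_user_buffer_sketch is a
 * made-up helper name, and the retry policy on a partial pin is an assumption. */
#include <linux/errno.h>
#include <linux/mm.h>

static int pin_user_buffer_sketch(unsigned long uaddr, struct page **pages,
                                  int nr_pages)
{
        int pinned;

        /* Walks the page tables locklessly; may pin fewer pages than asked. */
        pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
        if (pinned < 0)
                return pinned;

        if (pinned < nr_pages) {
                /* Drop the partial pin and let the caller fall back to a slow path. */
                while (pinned--)
                        put_page(pages[pinned]);
                return -EFAULT;
        }
        return 0;
}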
mm/rmap.c
@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
        int cpu;

        if (!tlb_ubc->flush_required)
                return;

        cpu = get_cpu();

        if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                local_flush_tlb();
                trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
        }

        if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
        cpumask_clear(&tlb_ubc->cpumask);
        arch_tlbbatch_flush(&tlb_ubc->arch);
        tlb_ubc->flush_required = false;
        tlb_ubc->writable = false;
        put_cpu();
}

/* Flush iff there are potentially writable TLB entries that can race with IO */

@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

        cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
        arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
        tlb_ubc->flush_required = true;

        /*