Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Misc changes:
   - fix lguest bug
   - fix /proc/meminfo output on certain configs
   - fix pvclock bug
   - fix reboot on certain iMacs by adding new reboot quirk
   - fix bootup crash
   - fix FPU boot line option parsing
   - add more x86 self-tests
   - small cleanups, documentation improvements, etc"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/cpu/amd: Remove an unneeded condition in srat_detect_node()
  x86/vdso/pvclock: Protect STABLE check with the seqcount
  x86/mm: Improve switch_mm() barrier comments
  selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn
  x86/reboot/quirks: Add iMac10,1 to pci_reboot_dmi_table[]
  lguest: Map switcher text R/O
  x86/boot: Hide local labels in verify_cpu()
  x86/fpu: Disable AVX when eagerfpu is off
  x86/fpu: Disable MPX when eagerfpu is off
  x86/fpu: Disable XGETBV1 when no XSAVE
  x86/fpu: Fix early FPU command-line parsing
  x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED
  selftests/x86: Disable the ldt_gdt_64 test for now
  x86/mm/pat: Make split_page_count() check for empty levels to fix /proc/meminfo output
  x86/boot: Double BOOT_HEAP_SIZE to 64KB
  x86/mm: Add barriers and document switch_mm()-vs-flush synchronization
This commit is contained in:
Linus Torvalds 2016-01-14 11:57:22 -08:00
commit 10a0c0f059
17 changed files with 343 additions and 152 deletions

View File

@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
*
* On Xen, we don't appear to have that guarantee, but Xen still
* supplies a valid seqlock using the version field.
*
* We only do pvclock vdso timing at all if
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
* mean that all vCPUs have matching pvti and that the TSC is
* synced, so we can just look at vCPU 0's pvti.
*/
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
*mode = VCLOCK_NONE;
return 0;
}
do {
version = pvti->version;
smp_rmb();
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
*mode = VCLOCK_NONE;
return 0;
}
tsc = rdtsc_ordered();
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
pvti_tsc_shift = pvti->tsc_shift;

View File

@ -27,7 +27,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
#define BOOT_HEAP_SIZE 0x8000
#define BOOT_HEAP_SIZE 0x10000
#endif /* !CONFIG_KERNEL_BZIP2 */

View File

@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:

View File

@ -20,14 +20,15 @@
/* Supported features which support lazy state saving */
#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
XFEATURE_MASK_SSE | \
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
XFEATURE_MASK_SSE)
/* Supported features which require eager state saving */
#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)

View File

@ -12,7 +12,9 @@
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
#define SWITCHER_TEXT_PAGES (1)
#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;

View File

@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
/* Re-load page tables */
/*
* Re-load page tables.
*
* This logic has an ordering constraint:
*
* CPU 0: Write to a PTE for 'next'
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
* CPU 1: set bit 1 in next's mm_cpumask
* CPU 1: load from the PTE that CPU 0 writes (implicit)
*
* We need to prevent an outcome in which CPU 1 observes
* the new PTE value and CPU 0 observes bit 1 clear in
* mm_cpumask. (If that occurs, then the IPI will never
* be sent, and CPU 0's TLB will contain a stale entry.)
*
* The bad outcome can occur if either CPU's load is
* reordered before that CPU's store, so both CPUs must
* execute full barriers to prevent this from happening.
*
* Thus, switch_mm needs a full barrier between the
* store to mm_cpumask and any operation that could load
* from next->pgd. TLB fills are special and can happen
* due to instruction fetches or for no reason at all,
* and neither LOCK nor MFENCE orders them.
* Fortunately, load_cr3() is serializing and gives the
* ordering guarantee we need.
*
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*
* As above, load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

View File

@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
*/
int ht_nodeid = c->initial_apicid;
if (ht_nodeid >= 0 &&
__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))

View File

@ -3,8 +3,11 @@
*/
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
#include <asm/cmdline.h>
#include <linux/sched.h>
#include <linux/init.h>
/*
* Initialize the TS bit in CR0 according to the style of context-switches
@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
static int __init eager_fpu_setup(char *s)
/*
* Find supported xfeatures based on cpu features and command-line input.
* This must be called after fpu__init_parse_early_param() is called and
* xfeatures_mask is enumerated.
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
if (!strcmp(s, "on"))
eagerfpu = ENABLE;
else if (!strcmp(s, "off"))
eagerfpu = DISABLE;
else if (!strcmp(s, "auto"))
eagerfpu = AUTO;
return 1;
/* Support all xfeatures known to us */
if (eagerfpu != DISABLE)
return XCNTXT_MASK;
/* Warning of xfeatures being disabled for no eagerfpu mode */
if (xfeatures_mask & XFEATURE_MASK_EAGER) {
pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
xfeatures_mask & XFEATURE_MASK_EAGER);
}
/* Return a mask that masks out all features requiring eagerfpu mode */
return ~XFEATURE_MASK_EAGER;
}
/*
* Disable features dependent on eagerfpu.
*/
static void __init fpu__clear_eager_fpu_features(void)
{
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
__setup("eagerfpu=", eager_fpu_setup);
/*
* Pick the FPU context switching strategy:
*
* When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
* the following is true:
*
* (1) the cpu has xsaveopt, as it has the optimization and doing eager
* FPU switching has a relatively low cost compared to a plain xsave;
* (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
* switching. Should the kernel boot with noxsaveopt, we support MPX
* with eager FPU switching at a higher cost.
*/
static void __init fpu__init_system_ctx_switch(void)
{
@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(current->thread.fpu.fpstate_active);
current_thread_info()->status = 0;
/* Auto enable eagerfpu for xsaveopt */
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
if (xfeatures_mask & XFEATURE_MASK_EAGER) {
if (eagerfpu == DISABLE) {
pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
xfeatures_mask & XFEATURE_MASK_EAGER);
xfeatures_mask &= ~XFEATURE_MASK_EAGER;
} else {
eagerfpu = ENABLE;
}
}
if (xfeatures_mask & XFEATURE_MASK_EAGER)
eagerfpu = ENABLE;
if (eagerfpu == ENABLE)
setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@ -315,12 +342,49 @@ static void __init fpu__init_system_ctx_switch(void)
printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
}
/*
* We parse fpu parameters early because fpu__init_system() is executed
* before parse_early_param().
*/
static void __init fpu__init_parse_early_param(void)
{
/*
* No need to check "eagerfpu=auto" again, since it is the
* initial default.
*/
if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
eagerfpu = DISABLE;
fpu__clear_eager_fpu_features();
} else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
eagerfpu = ENABLE;
}
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
setup_clear_cpu_cap(X86_FEATURE_FXSR);
setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
setup_clear_cpu_cap(X86_FEATURE_XMM);
}
if (cmdline_find_option_bool(boot_command_line, "noxsave"))
fpu__xstate_clear_all_cpu_caps();
if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
}
/*
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
fpu__init_parse_early_param();
fpu__init_system_early_generic(c);
/*
@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_ctx_switch();
}
/*
* Boot parameter to turn off FPU support and fall back to math-emu:
*/
static int __init no_387(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_FPU);
return 1;
}
__setup("no387", no_387);
/*
* Disable all xstate CPU features:
*/
static int __init x86_noxsave_setup(char *s)
{
if (strlen(s))
return 0;
fpu__xstate_clear_all_cpu_caps();
return 1;
}
__setup("noxsave", x86_noxsave_setup);
/*
* Disable the XSAVEOPT instruction specifically:
*/
static int __init x86_noxsaveopt_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
return 1;
}
__setup("noxsaveopt", x86_noxsaveopt_setup);
/*
* Disable the XSAVES instruction:
*/
static int __init x86_noxsaves_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
return 1;
}
__setup("noxsaves", x86_noxsaves_setup);
/*
* Disable FX save/restore and SSE support:
*/
static int __init x86_nofxsr_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_FXSR);
setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
setup_clear_cpu_cap(X86_FEATURE_XMM);
return 1;
}
__setup("nofxsr", x86_nofxsr_setup);

View File

@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
}
/*
@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
BUG();
}
/* Support only the state known to the OS: */
xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();

View File

@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
{ /* Handle problems with rebooting on the iMac10,1. */
.callback = set_pci_reboot,
.ident = "Apple iMac10,1",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
},
},
/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */

View File

@ -48,31 +48,31 @@ verify_cpu:
pushfl
popl %eax
cmpl %eax,%ebx
jz verify_cpu_no_longmode # cpu has no cpuid
jz .Lverify_cpu_no_longmode # cpu has no cpuid
#endif
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
jb verify_cpu_no_longmode # no cpuid 1
jb .Lverify_cpu_no_longmode # no cpuid 1
xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
jnz verify_cpu_noamd
jnz .Lverify_cpu_noamd
cmpl $0x69746e65,%edx
jnz verify_cpu_noamd
jnz .Lverify_cpu_noamd
cmpl $0x444d4163,%ecx
jnz verify_cpu_noamd
jnz .Lverify_cpu_noamd
mov $1,%di # cpu is from AMD
jmp verify_cpu_check
jmp .Lverify_cpu_check
verify_cpu_noamd:
.Lverify_cpu_noamd:
cmpl $0x756e6547,%ebx # GenuineIntel?
jnz verify_cpu_check
jnz .Lverify_cpu_check
cmpl $0x49656e69,%edx
jnz verify_cpu_check
jnz .Lverify_cpu_check
cmpl $0x6c65746e,%ecx
jnz verify_cpu_check
jnz .Lverify_cpu_check
# only call IA32_MISC_ENABLE when:
# family > 6 || (family == 6 && model >= 0xd)
@ -83,59 +83,59 @@ verify_cpu_noamd:
andl $0x0ff00f00, %eax # mask family and extended family
shrl $8, %eax
cmpl $6, %eax
ja verify_cpu_clear_xd # family > 6, ok
jb verify_cpu_check # family < 6, skip
ja .Lverify_cpu_clear_xd # family > 6, ok
jb .Lverify_cpu_check # family < 6, skip
andl $0x000f00f0, %ecx # mask model and extended model
shrl $4, %ecx
cmpl $0xd, %ecx
jb verify_cpu_check # family == 6, model < 0xd, skip
jb .Lverify_cpu_check # family == 6, model < 0xd, skip
verify_cpu_clear_xd:
.Lverify_cpu_clear_xd:
movl $MSR_IA32_MISC_ENABLE, %ecx
rdmsr
btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
jnc verify_cpu_check # only write MSR if bit was changed
jnc .Lverify_cpu_check # only write MSR if bit was changed
wrmsr
verify_cpu_check:
.Lverify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
jnz verify_cpu_no_longmode
jnz .Lverify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
jb verify_cpu_no_longmode # no extended cpuid
jb .Lverify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
jnz verify_cpu_no_longmode
jnz .Lverify_cpu_no_longmode
verify_cpu_sse_test:
.Lverify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
je verify_cpu_sse_ok
je .Lverify_cpu_sse_ok
test %di,%di
jz verify_cpu_no_longmode # only try to force SSE on AMD
jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
jmp verify_cpu_sse_test # try again
jmp .Lverify_cpu_sse_test # try again
verify_cpu_no_longmode:
.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
ret
verify_cpu_sse_ok:
.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
ret

View File

@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
if (IS_ALIGNED(addr, PAGE_SIZE) &&
IS_ALIGNED(next, PAGE_SIZE)) {
if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simplely not in use.

View File

@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
static void split_page_count(int level)
{
if (direct_pages_count[level] == 0)
return;
direct_pages_count[level]--;
direct_pages_count[level - 1] += PTRS_PER_PTE;
}

View File

@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
preempt_disable();
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
/* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
if (current->active_mm != mm)
if (current->active_mm != mm) {
/* Synchronize with switch_mm. */
smp_mb();
goto out;
}
if (!current->mm) {
leave_mm(smp_processor_id());
/* Synchronize with switch_mm. */
smp_mb();
goto out;
}
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
/*
* Both branches below are implicit full barriers (MOV to CR or
* INVLPG) that synchronize with switch_mm.
*/
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
preempt_disable();
if (current->active_mm == mm) {
if (current->mm)
if (current->mm) {
/*
* Implicit full barrier (INVLPG) that synchronizes
* with switch_mm.
*/
__flush_tlb_one(start);
else
} else {
leave_mm(smp_processor_id());
/* Synchronize with switch_mm. */
smp_mb();
}
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)

View File

@ -22,7 +22,8 @@
unsigned long switcher_addr;
struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
static struct vm_struct *switcher_text_vma;
static struct vm_struct *switcher_stacks_vma;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@ -82,55 +83,81 @@ static __init int map_switcher(void)
}
}
/*
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
* It goes in the first page, which we map in momentarily.
*/
memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
end_switcher_text - start_switcher_text);
kunmap(lg_switcher_pages[0]);
/*
* We place the Switcher underneath the fixmap area, which is the
* highest virtual address we can get. This is important, since we
* tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/
switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
/*
* Now we reserve the "virtual memory area" we want. We might
* not get it in theory, but in practice it's worked so far.
* The end address needs +1 because __get_vm_area allocates an
* extra guard page, so we need space for that.
* Now we reserve the "virtual memory area"s we want. We might
* not get them in theory, but in practice it's worked so far.
*
* We want the switcher text to be read-only and executable, and
* the stacks to be read-write and non-executable.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
switcher_addr,
switcher_addr + PAGE_SIZE);
if (!switcher_text_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_pages;
}
switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
VM_ALLOC|VM_NO_GUARD,
switcher_addr + PAGE_SIZE,
switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
if (!switcher_stacks_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_text_vma;
}
/*
* This code actually sets up the pages we've allocated to appear at
* switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages.
* kind of pages we're mapping (kernel text pages and kernel writable
* pages respectively), and a pointer to our array of struct pages.
*/
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
goto free_vma;
printk("lguest: text map_vm_area failed: %i\n", err);
goto free_vmas;
}
err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
lg_switcher_pages + SWITCHER_TEXT_PAGES);
if (err) {
printk("lguest: stacks map_vm_area failed: %i\n", err);
goto free_vmas;
}
/*
* Now the Switcher is mapped at the right address, we can't fail!
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/
memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text);
printk(KERN_INFO "lguest: mapped switcher at %p\n",
switcher_vma->addr);
switcher_text_vma->addr);
/* And we succeeded... */
return 0;
free_vma:
vunmap(switcher_vma->addr);
free_vmas:
/* Undoes map_vm_area and __get_vm_area */
vunmap(switcher_stacks_vma->addr);
free_text_vma:
vunmap(switcher_text_vma->addr);
free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
@ -148,7 +175,8 @@ static void unmap_switcher(void)
unsigned int i;
/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
vunmap(switcher_vma->addr);
vunmap(switcher_text_vma->addr);
vunmap(switcher_stacks_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(lg_switcher_pages[i], 0);

View File

@ -4,9 +4,11 @@ include ../lib.mk
.PHONY: all all_32 all_64 warn_32bit_failure clean
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP
test_FCMOV test_FCOMI test_FISTTP \
ldt_gdt \
vdso_restorer
TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)

View File

@ -0,0 +1,88 @@
/*
* vdso_restorer.c - tests vDSO-based signal restore
* Copyright (c) 2015 Andrew Lutomirski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This makes sure that sa_restorer == NULL keeps working on 32-bit
* configurations. Modern glibc doesn't use it under any circumstances,
* so it's easy to overlook breakage.
*
* 64-bit userspace has never supported sa_restorer == NULL, so this is
* 32-bit only.
*/
#define _GNU_SOURCE
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <syscall.h>
#include <sys/syscall.h>
/* Open-code this -- the headers are too messy to easily use them. */
struct real_sigaction {
void *handler;
unsigned long flags;
void *restorer;
unsigned int mask[2];
};
static volatile sig_atomic_t handler_called;
static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
{
handler_called = 1;
}
static void handler_without_siginfo(int sig)
{
handler_called = 1;
}
int main()
{
int nerrs = 0;
struct real_sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.handler = handler_with_siginfo;
sa.flags = SA_SIGINFO;
sa.restorer = NULL; /* request kernel-provided restorer */
if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
err(1, "raw rt_sigaction syscall");
raise(SIGUSR1);
if (handler_called) {
printf("[OK]\tSA_SIGINFO handler returned successfully\n");
} else {
printf("[FAIL]\tSA_SIGINFO handler was not called\n");
nerrs++;
}
sa.flags = 0;
sa.handler = handler_without_siginfo;
if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
err(1, "raw sigaction syscall");
handler_called = 0;
raise(SIGUSR1);
if (handler_called) {
printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
} else {
printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
nerrs++;
}
}