Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Gleb Natapov: "The highlights of the release are nested EPT and pv-ticketlocks support (hypervisor part, guest part, which is most of the code, goes through tip tree). Apart of that there are many fixes for all arches" Fix up semantic conflicts as discussed in the pull request thread.. * 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (88 commits) ARM: KVM: Add newlines to panic strings ARM: KVM: Work around older compiler bug ARM: KVM: Simplify tracepoint text ARM: KVM: Fix kvm_set_pte assignment ARM: KVM: vgic: Bump VGIC_NR_IRQS to 256 ARM: KVM: Bugfix: vgic_bytemap_get_reg per cpu regs ARM: KVM: vgic: fix GICD_ICFGRn access ARM: KVM: vgic: simplify vgic_get_target_reg KVM: MMU: remove unused parameter KVM: PPC: Book3S PR: Rework kvmppc_mmu_book3s_64_xlate() KVM: PPC: Book3S PR: Make instruction fetch fallback work for system calls KVM: PPC: Book3S PR: Don't corrupt guest state when kernel uses VMX KVM: x86: update masterclock when kvmclock_offset is calculated (v2) KVM: PPC: Book3S: Fix compile error in XICS emulation KVM: PPC: Book3S PR: return appropriate error when allocation fails arch: powerpc: kvm: add signed type cast for comparation KVM: x86: add comments where MMIO does not return to the emulator KVM: vmx: count exits to userspace during invalid guest emulation KVM: rename __kvm_io_bus_sort_cmp to kvm_io_bus_cmp kvm: optimize away THP checks in kvm_is_mmio_pfn() ...
2024-11-10 22:21:40 +00:00 · 2013-09-04 18:15:06 -07:00 · 2013-09-04 18:15:06 -07:00 · ae7a835cc5
commit ae7a835cc5
parent cf39c8e535 6b9e4fa074
73 changed files with 2358 additions and 1348 deletions
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@ -43,6 +43,10 @@ KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
 KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
                                   ||       || writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || spinlock support.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                   ||       || per-cpu warps are expected in
                                   ||       || kvmclock.
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@ -64,3 +64,17 @@ Purpose: To enable communication between the hypervisor and guest there is a
 shared page that contains parts of supervisor visible register state.
 The guest can map this shared page to access its supervisor register through
 memory using this hypercall.
+
+5. KVM_HC_KICK_CPU
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to wakeup a vcpu from HLT state
+Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest
+kernel mode for an event to occur (ex: a spinlock to become available) can
+execute HLT instruction once it has busy-waited for more than a threshold
+time-interval. Execution of HLT instruction would cause the hypervisor to put
+the vcpu to sleep until occurence of an appropriate event. Another vcpu of the
+same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
+specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
+is used in the hypercall for future use.
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@ -104,6 +104,7 @@ CONFIG_IP_SCTP=y
 CONFIG_VLAN_8021Q=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_BLOCK=y
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@ -78,6 +78,7 @@ CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_CONNECTOR=y
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@ -79,6 +79,7 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_M25P80=y
 CONFIG_PROC_DEVICETREE=y
--- a/arch/arm/include/asm/dma-contiguous.h
+++ b/arch/arm/include/asm/dma-contiguous.h
@ -2,7 +2,7 @@
 #define ASMARM_DMA_CONTIGUOUS_H

 #ifdef __KERNEL__
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA

 #include <linux/types.h>
 #include <asm-generic/dma-contiguous.h>
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@ -64,7 +64,7 @@ void kvm_clear_hyp_idmap(void);

 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
-	pte_val(*pte) = new_pte;
+	*pte = new_pte;
 	/*
 	 * flush_pmd_entry just takes a void pointer and cleans the necessary
 	 * cache entries, so we can reuse the function for ptes.
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@ -219,6 +219,10 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -EINVAL;
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_userspace_memory_region *mem,
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@ -492,10 +492,10 @@ __kvm_hyp_code_end:
 	.section ".rodata"

 und_die_str:
-	.ascii	"unexpected undefined exception in Hyp mode at: %#08x"
+	.ascii	"unexpected undefined exception in Hyp mode at: %#08x\n"
 pabt_die_str:
-	.ascii	"unexpected prefetch abort in Hyp mode at: %#08x"
+	.ascii	"unexpected prefetch abort in Hyp mode at: %#08x\n"
 dabt_die_str:
-	.ascii	"unexpected data abort in Hyp mode at: %#08x"
+	.ascii	"unexpected data abort in Hyp mode at: %#08x\n"
 svc_die_str:
-	.ascii	"unexpected HVC/SVC trap in Hyp mode at: %#08x"
+	.ascii	"unexpected HVC/SVC trap in Hyp mode at: %#08x\n"
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@ -40,7 +40,7 @@ static struct kvm_regs a15_regs_reset = {
 };

 static const struct kvm_irq_level a15_vtimer_irq = {
-	.irq = 27,
+	{ .irq = 27 },
 	.level = 1,
 };

--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@ -59,10 +59,9 @@ TRACE_EVENT(kvm_guest_fault,
 		__entry->ipa			= ipa;
 	),

-	TP_printk("guest fault at PC %#08lx (hxfar %#08lx, "
-		  "ipa %#16llx, hsr %#08lx",
-		  __entry->vcpu_pc, __entry->hxfar,
-		  __entry->ipa, __entry->hsr)
+	TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+		  __entry->ipa, __entry->hsr,
+		  __entry->hxfar, __entry->vcpu_pc)
 );

 TRACE_EVENT(kvm_irq_line,
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@ -358,7 +358,7 @@ static int __init atomic_pool_init(void)
 	if (!pages)
 		goto no_pages;

-	if (IS_ENABLED(CONFIG_CMA))
+	if (IS_ENABLED(CONFIG_DMA_CMA))
 		ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
 					      atomic_pool_init);
 	else
@ -670,7 +670,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		addr = __alloc_simple_buffer(dev, size, gfp, &page);
 	else if (!(gfp & __GFP_WAIT))
 		addr = __alloc_from_pool(size, &page);
-	else if (!IS_ENABLED(CONFIG_CMA))
+	else if (!IS_ENABLED(CONFIG_DMA_CMA))
 		addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller);
 	else
 		addr = __alloc_from_contiguous(dev, size, prot, &page, caller);
@ -759,7 +759,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
 		__dma_free_buffer(page, size);
 	} else if (__free_from_pool(cpu_addr, size)) {
 		return;
-	} else if (!IS_ENABLED(CONFIG_CMA)) {
+	} else if (!IS_ENABLED(CONFIG_DMA_CMA)) {
 		__dma_free_remap(cpu_addr, size);
 		__dma_free_buffer(page, size);
 	} else {
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@ -1560,6 +1560,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 	return 0;
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		struct kvm_userspace_memory_region *mem,
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@ -1,13 +1,13 @@
 /*
-* This file is subject to the terms and conditions of the GNU General Public
-* License.  See the file "COPYING" in the main directory of this archive
-* for more details.
-*
-* Main entry point for the guest, exception handling.
-*
-* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
-* Authors: Sanjay Lal <sanjayl@kymasys.com>
-*/
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */

 #include <asm/asm.h>
 #include <asm/asmmacro.h>
@ -55,195 +55,193 @@
 * a0: run
 * a1: vcpu
 */
+	.set	noreorder
+	.set	noat

 FEXPORT(__kvm_mips_vcpu_run)
-    .set    push
-    .set    noreorder
-    .set    noat
+	/* k0/k1 not being used in host kernel context */
+	INT_ADDIU k1, sp, -PT_SIZE
+	LONG_S	$0, PT_R0(k1)
+	LONG_S	$1, PT_R1(k1)
+	LONG_S	$2, PT_R2(k1)
+	LONG_S	$3, PT_R3(k1)

-    /* k0/k1 not being used in host kernel context */
-	addiu  		k1,sp, -PT_SIZE
-    LONG_S	    $0, PT_R0(k1)
-    LONG_S     	$1, PT_R1(k1)
-    LONG_S     	$2, PT_R2(k1)
-    LONG_S     	$3, PT_R3(k1)
+	LONG_S	$4, PT_R4(k1)
+	LONG_S	$5, PT_R5(k1)
+	LONG_S	$6, PT_R6(k1)
+	LONG_S	$7, PT_R7(k1)

-    LONG_S     	$4, PT_R4(k1)
-    LONG_S     	$5, PT_R5(k1)
-    LONG_S     	$6, PT_R6(k1)
-    LONG_S     	$7, PT_R7(k1)
+	LONG_S	$8,  PT_R8(k1)
+	LONG_S	$9,  PT_R9(k1)
+	LONG_S	$10, PT_R10(k1)
+	LONG_S	$11, PT_R11(k1)
+	LONG_S	$12, PT_R12(k1)
+	LONG_S	$13, PT_R13(k1)
+	LONG_S	$14, PT_R14(k1)
+	LONG_S	$15, PT_R15(k1)
+	LONG_S	$16, PT_R16(k1)
+	LONG_S	$17, PT_R17(k1)

-    LONG_S     	$8,  PT_R8(k1)
-    LONG_S     	$9,  PT_R9(k1)
-    LONG_S     	$10, PT_R10(k1)
-    LONG_S     	$11, PT_R11(k1)
-    LONG_S     	$12, PT_R12(k1)
-    LONG_S     	$13, PT_R13(k1)
-    LONG_S     	$14, PT_R14(k1)
-    LONG_S     	$15, PT_R15(k1)
-    LONG_S     	$16, PT_R16(k1)
-    LONG_S     	$17, PT_R17(k1)
-
-    LONG_S     	$18, PT_R18(k1)
-    LONG_S     	$19, PT_R19(k1)
-    LONG_S     	$20, PT_R20(k1)
-    LONG_S     	$21, PT_R21(k1)
-    LONG_S     	$22, PT_R22(k1)
-    LONG_S     	$23, PT_R23(k1)
-    LONG_S     	$24, PT_R24(k1)
-    LONG_S     	$25, PT_R25(k1)
+	LONG_S	$18, PT_R18(k1)
+	LONG_S	$19, PT_R19(k1)
+	LONG_S	$20, PT_R20(k1)
+	LONG_S	$21, PT_R21(k1)
+	LONG_S	$22, PT_R22(k1)
+	LONG_S	$23, PT_R23(k1)
+	LONG_S	$24, PT_R24(k1)
+	LONG_S	$25, PT_R25(k1)

 	/* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */

-    LONG_S     	$28, PT_R28(k1)
-    LONG_S     	$29, PT_R29(k1)
-    LONG_S     	$30, PT_R30(k1)
-    LONG_S     	$31, PT_R31(k1)
+	LONG_S	$28, PT_R28(k1)
+	LONG_S	$29, PT_R29(k1)
+	LONG_S	$30, PT_R30(k1)
+	LONG_S	$31, PT_R31(k1)

-    /* Save hi/lo */
-	mflo		v0
-	LONG_S		v0, PT_LO(k1)
-	mfhi   		v1
-	LONG_S		v1, PT_HI(k1)
+	/* Save hi/lo */
+	mflo	v0
+	LONG_S	v0, PT_LO(k1)
+	mfhi	v1
+	LONG_S	v1, PT_HI(k1)

 	/* Save host status */
-	mfc0		v0, CP0_STATUS
-	LONG_S		v0, PT_STATUS(k1)
+	mfc0	v0, CP0_STATUS
+	LONG_S	v0, PT_STATUS(k1)

 	/* Save host ASID, shove it into the BVADDR location */
-	mfc0 		v1,CP0_ENTRYHI
-	andi		v1, 0xff
-	LONG_S		v1, PT_HOST_ASID(k1)
+	mfc0	v1, CP0_ENTRYHI
+	andi	v1, 0xff
+	LONG_S	v1, PT_HOST_ASID(k1)

-    /* Save DDATA_LO, will be used to store pointer to vcpu */
-    mfc0        v1, CP0_DDATA_LO
-    LONG_S      v1, PT_HOST_USERLOCAL(k1)
+	/* Save DDATA_LO, will be used to store pointer to vcpu */
+	mfc0	v1, CP0_DDATA_LO
+	LONG_S	v1, PT_HOST_USERLOCAL(k1)

-    /* DDATA_LO has pointer to vcpu */
-    mtc0        a1,CP0_DDATA_LO
+	/* DDATA_LO has pointer to vcpu */
+	mtc0	a1, CP0_DDATA_LO

-    /* Offset into vcpu->arch */
-	addiu		k1, a1, VCPU_HOST_ARCH
+	/* Offset into vcpu->arch */
+	INT_ADDIU k1, a1, VCPU_HOST_ARCH

-    /* Save the host stack to VCPU, used for exception processing when we exit from the Guest */
-    LONG_S      sp, VCPU_HOST_STACK(k1)
+	/*
+	 * Save the host stack to VCPU, used for exception processing
+	 * when we exit from the Guest
+	 */
+	LONG_S	sp, VCPU_HOST_STACK(k1)

-    /* Save the kernel gp as well */
-    LONG_S      gp, VCPU_HOST_GP(k1)
+	/* Save the kernel gp as well */
+	LONG_S	gp, VCPU_HOST_GP(k1)

 	/* Setup status register for running the guest in UM, interrupts are disabled */
-	li			k0,(ST0_EXL | KSU_USER| ST0_BEV)
-	mtc0		k0,CP0_STATUS
-    ehb
+	li	k0, (ST0_EXL | KSU_USER | ST0_BEV)
+	mtc0	k0, CP0_STATUS
+	ehb

-    /* load up the new EBASE */
-    LONG_L      k0, VCPU_GUEST_EBASE(k1)
-    mtc0        k0,CP0_EBASE
+	/* load up the new EBASE */
+	LONG_L	k0, VCPU_GUEST_EBASE(k1)
+	mtc0	k0, CP0_EBASE

-    /* Now that the new EBASE has been loaded, unset BEV, set interrupt mask as it was
-     * but make sure that timer interrupts are enabled
-     */
-    li          k0,(ST0_EXL | KSU_USER | ST0_IE)
-    andi        v0, v0, ST0_IM
-    or          k0, k0, v0
-    mtc0        k0,CP0_STATUS
-    ehb
+	/*
+	 * Now that the new EBASE has been loaded, unset BEV, set
+	 * interrupt mask as it was but make sure that timer interrupts
+	 * are enabled
+	 */
+	li	k0, (ST0_EXL | KSU_USER | ST0_IE)
+	andi	v0, v0, ST0_IM
+	or	k0, k0, v0
+	mtc0	k0, CP0_STATUS
+	ehb


 	/* Set Guest EPC */
-	LONG_L		t0, VCPU_PC(k1)
-	mtc0		t0, CP0_EPC
+	LONG_L	t0, VCPU_PC(k1)
+	mtc0	t0, CP0_EPC

 FEXPORT(__kvm_mips_load_asid)
-    /* Set the ASID for the Guest Kernel */
-    sll         t0, t0, 1                       /* with kseg0 @ 0x40000000, kernel */
-                                                /* addresses shift to 0x80000000 */
-    bltz        t0, 1f                          /* If kernel */
-	addiu       t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-    addiu       t1, k1, VCPU_GUEST_USER_ASID    /* else user */
+	/* Set the ASID for the Guest Kernel */
+	INT_SLL	t0, t0, 1	/* with kseg0 @ 0x40000000, kernel */
+			        /* addresses shift to 0x80000000 */
+	bltz	t0, 1f		/* If kernel */
+	 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
+	INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
 1:
-    /* t1: contains the base of the ASID array, need to get the cpu id  */
-    LONG_L      t2, TI_CPU($28)             /* smp_processor_id */
-    sll         t2, t2, 2                   /* x4 */
-    addu        t3, t1, t2
-    LONG_L      k0, (t3)
-    andi        k0, k0, 0xff
-	mtc0		k0,CP0_ENTRYHI
-    ehb
+	     /* t1: contains the base of the ASID array, need to get the cpu id  */
+	LONG_L	t2, TI_CPU($28)             /* smp_processor_id */
+	INT_SLL	t2, t2, 2                   /* x4 */
+	REG_ADDU t3, t1, t2
+	LONG_L	k0, (t3)
+	andi	k0, k0, 0xff
+	mtc0	k0, CP0_ENTRYHI
+	ehb

-    /* Disable RDHWR access */
-    mtc0    zero,  CP0_HWRENA
+	/* Disable RDHWR access */
+	mtc0	zero, CP0_HWRENA

-    /* Now load up the Guest Context from VCPU */
-    LONG_L     	$1, VCPU_R1(k1)
-    LONG_L     	$2, VCPU_R2(k1)
-    LONG_L     	$3, VCPU_R3(k1)
+	/* Now load up the Guest Context from VCPU */
+	LONG_L	$1, VCPU_R1(k1)
+	LONG_L	$2, VCPU_R2(k1)
+	LONG_L	$3, VCPU_R3(k1)

-    LONG_L     	$4, VCPU_R4(k1)
-    LONG_L     	$5, VCPU_R5(k1)
-    LONG_L     	$6, VCPU_R6(k1)
-    LONG_L     	$7, VCPU_R7(k1)
+	LONG_L	$4, VCPU_R4(k1)
+	LONG_L	$5, VCPU_R5(k1)
+	LONG_L	$6, VCPU_R6(k1)
+	LONG_L	$7, VCPU_R7(k1)

-    LONG_L     	$8,  VCPU_R8(k1)
-    LONG_L     	$9,  VCPU_R9(k1)
-    LONG_L     	$10, VCPU_R10(k1)
-    LONG_L     	$11, VCPU_R11(k1)
-    LONG_L     	$12, VCPU_R12(k1)
-    LONG_L     	$13, VCPU_R13(k1)
-    LONG_L     	$14, VCPU_R14(k1)
-    LONG_L     	$15, VCPU_R15(k1)
-    LONG_L     	$16, VCPU_R16(k1)
-    LONG_L     	$17, VCPU_R17(k1)
-    LONG_L     	$18, VCPU_R18(k1)
-    LONG_L     	$19, VCPU_R19(k1)
-    LONG_L     	$20, VCPU_R20(k1)
-    LONG_L     	$21, VCPU_R21(k1)
-    LONG_L     	$22, VCPU_R22(k1)
-    LONG_L     	$23, VCPU_R23(k1)
-    LONG_L     	$24, VCPU_R24(k1)
-    LONG_L     	$25, VCPU_R25(k1)
+	LONG_L	$8, VCPU_R8(k1)
+	LONG_L	$9, VCPU_R9(k1)
+	LONG_L	$10, VCPU_R10(k1)
+	LONG_L	$11, VCPU_R11(k1)
+	LONG_L	$12, VCPU_R12(k1)
+	LONG_L	$13, VCPU_R13(k1)
+	LONG_L	$14, VCPU_R14(k1)
+	LONG_L	$15, VCPU_R15(k1)
+	LONG_L	$16, VCPU_R16(k1)
+	LONG_L	$17, VCPU_R17(k1)
+	LONG_L	$18, VCPU_R18(k1)
+	LONG_L	$19, VCPU_R19(k1)
+	LONG_L	$20, VCPU_R20(k1)
+	LONG_L	$21, VCPU_R21(k1)
+	LONG_L	$22, VCPU_R22(k1)
+	LONG_L	$23, VCPU_R23(k1)
+	LONG_L	$24, VCPU_R24(k1)
+	LONG_L	$25, VCPU_R25(k1)

-    /* k0/k1 loaded up later */
+	/* k0/k1 loaded up later */

-    LONG_L     	$28, VCPU_R28(k1)
-    LONG_L     	$29, VCPU_R29(k1)
-    LONG_L     	$30, VCPU_R30(k1)
-    LONG_L     	$31, VCPU_R31(k1)
+	LONG_L	$28, VCPU_R28(k1)
+	LONG_L	$29, VCPU_R29(k1)
+	LONG_L	$30, VCPU_R30(k1)
+	LONG_L	$31, VCPU_R31(k1)

-    /* Restore hi/lo */
-	LONG_L		k0, VCPU_LO(k1)
-	mtlo		k0
+	/* Restore hi/lo */
+	LONG_L	k0, VCPU_LO(k1)
+	mtlo	k0

-	LONG_L		k0, VCPU_HI(k1)
-	mthi   		k0
+	LONG_L	k0, VCPU_HI(k1)
+	mthi	k0

 FEXPORT(__kvm_mips_load_k0k1)
 	/* Restore the guest's k0/k1 registers */
-    LONG_L     	k0, VCPU_R26(k1)
-    LONG_L     	k1, VCPU_R27(k1)
+	LONG_L	k0, VCPU_R26(k1)
+	LONG_L	k1, VCPU_R27(k1)

-    /* Jump to guest */
+	/* Jump to guest */
 	eret
-	.set	pop

 VECTOR(MIPSX(exception), unknown)
 /*
 * Find out what mode we came from and jump to the proper handler.
 */
-    .set    push
-	.set	noat
-    .set    noreorder
-    mtc0    k0, CP0_ERROREPC    #01: Save guest k0
-    ehb                         #02:
+	mtc0	k0, CP0_ERROREPC	#01: Save guest k0
+	ehb				#02:

-    mfc0    k0, CP0_EBASE       #02: Get EBASE
-    srl     k0, k0, 10          #03: Get rid of CPUNum
-    sll     k0, k0, 10          #04
-    LONG_S  k1, 0x3000(k0)      #05: Save k1 @ offset 0x3000
-    addiu   k0, k0, 0x2000      #06: Exception handler is installed @ offset 0x2000
-	j	k0				        #07: jump to the function
-	nop				        	#08: branch delay slot
-	.set	push
+	mfc0	k0, CP0_EBASE		#02: Get EBASE
+	INT_SRL	k0, k0, 10		#03: Get rid of CPUNum
+	INT_SLL	k0, k0, 10		#04
+	LONG_S	k1, 0x3000(k0)		#05: Save k1 @ offset 0x3000
+	INT_ADDIU k0, k0, 0x2000		#06: Exception handler is installed @ offset 0x2000
+	j	k0			#07: jump to the function
+	 nop				#08: branch delay slot
 VECTOR_END(MIPSX(exceptionEnd))
 .end MIPSX(exception)

@ -253,329 +251,327 @@ VECTOR_END(MIPSX(exceptionEnd))
 *
 */
 NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
-    .set    push
-    .set    noat
-    .set    noreorder
+	/* Get the VCPU pointer from DDTATA_LO */
+	mfc0	k1, CP0_DDATA_LO
+	INT_ADDIU k1, k1, VCPU_HOST_ARCH

-    /* Get the VCPU pointer from DDTATA_LO */
-    mfc0        k1, CP0_DDATA_LO
-	addiu		k1, k1, VCPU_HOST_ARCH
+	/* Start saving Guest context to VCPU */
+	LONG_S	$0, VCPU_R0(k1)
+	LONG_S	$1, VCPU_R1(k1)
+	LONG_S	$2, VCPU_R2(k1)
+	LONG_S	$3, VCPU_R3(k1)
+	LONG_S	$4, VCPU_R4(k1)
+	LONG_S	$5, VCPU_R5(k1)
+	LONG_S	$6, VCPU_R6(k1)
+	LONG_S	$7, VCPU_R7(k1)
+	LONG_S	$8, VCPU_R8(k1)
+	LONG_S	$9, VCPU_R9(k1)
+	LONG_S	$10, VCPU_R10(k1)
+	LONG_S	$11, VCPU_R11(k1)
+	LONG_S	$12, VCPU_R12(k1)
+	LONG_S	$13, VCPU_R13(k1)
+	LONG_S	$14, VCPU_R14(k1)
+	LONG_S	$15, VCPU_R15(k1)
+	LONG_S	$16, VCPU_R16(k1)
+	LONG_S	$17, VCPU_R17(k1)
+	LONG_S	$18, VCPU_R18(k1)
+	LONG_S	$19, VCPU_R19(k1)
+	LONG_S	$20, VCPU_R20(k1)
+	LONG_S	$21, VCPU_R21(k1)
+	LONG_S	$22, VCPU_R22(k1)
+	LONG_S	$23, VCPU_R23(k1)
+	LONG_S	$24, VCPU_R24(k1)
+	LONG_S	$25, VCPU_R25(k1)

-    /* Start saving Guest context to VCPU */
-    LONG_S  $0, VCPU_R0(k1)
-    LONG_S  $1, VCPU_R1(k1)
-    LONG_S  $2, VCPU_R2(k1)
-    LONG_S  $3, VCPU_R3(k1)
-    LONG_S  $4, VCPU_R4(k1)
-    LONG_S  $5, VCPU_R5(k1)
-    LONG_S  $6, VCPU_R6(k1)
-    LONG_S  $7, VCPU_R7(k1)
-    LONG_S  $8, VCPU_R8(k1)
-    LONG_S  $9, VCPU_R9(k1)
-    LONG_S  $10, VCPU_R10(k1)
-    LONG_S  $11, VCPU_R11(k1)
-    LONG_S  $12, VCPU_R12(k1)
-    LONG_S  $13, VCPU_R13(k1)
-    LONG_S  $14, VCPU_R14(k1)
-    LONG_S  $15, VCPU_R15(k1)
-    LONG_S  $16, VCPU_R16(k1)
-    LONG_S  $17,VCPU_R17(k1)
-    LONG_S  $18, VCPU_R18(k1)
-    LONG_S  $19, VCPU_R19(k1)
-    LONG_S  $20, VCPU_R20(k1)
-    LONG_S  $21, VCPU_R21(k1)
-    LONG_S  $22, VCPU_R22(k1)
-    LONG_S  $23, VCPU_R23(k1)
-    LONG_S  $24, VCPU_R24(k1)
-    LONG_S  $25, VCPU_R25(k1)
+	/* Guest k0/k1 saved later */

-    /* Guest k0/k1 saved later */
+	LONG_S	$28, VCPU_R28(k1)
+	LONG_S	$29, VCPU_R29(k1)
+	LONG_S	$30, VCPU_R30(k1)
+	LONG_S	$31, VCPU_R31(k1)

-    LONG_S  $28, VCPU_R28(k1)
-    LONG_S  $29, VCPU_R29(k1)
-    LONG_S  $30, VCPU_R30(k1)
-    LONG_S  $31, VCPU_R31(k1)
+	/* We need to save hi/lo and restore them on
+	 * the way out
+	 */
+	mfhi	t0
+	LONG_S	t0, VCPU_HI(k1)

-    /* We need to save hi/lo and restore them on
-     * the way out
-     */
-    mfhi    t0
-    LONG_S  t0, VCPU_HI(k1)
+	mflo	t0
+	LONG_S	t0, VCPU_LO(k1)

-    mflo    t0
-    LONG_S  t0, VCPU_LO(k1)
+	/* Finally save guest k0/k1 to VCPU */
+	mfc0	t0, CP0_ERROREPC
+	LONG_S	t0, VCPU_R26(k1)

-    /* Finally save guest k0/k1 to VCPU */
-    mfc0    t0, CP0_ERROREPC
-    LONG_S  t0, VCPU_R26(k1)
+	/* Get GUEST k1 and save it in VCPU */
+	PTR_LI	t1, ~0x2ff
+	mfc0	t0, CP0_EBASE
+	and	t0, t0, t1
+	LONG_L	t0, 0x3000(t0)
+	LONG_S	t0, VCPU_R27(k1)

-    /* Get GUEST k1 and save it in VCPU */
-    la      t1, ~0x2ff
-    mfc0    t0, CP0_EBASE
-    and     t0, t0, t1
-    LONG_L  t0, 0x3000(t0)
-    LONG_S  t0, VCPU_R27(k1)
+	/* Now that context has been saved, we can use other registers */

-    /* Now that context has been saved, we can use other registers */
+	/* Restore vcpu */
+	mfc0	a1, CP0_DDATA_LO
+	move	s1, a1

-    /* Restore vcpu */
-    mfc0        a1, CP0_DDATA_LO
-    move        s1, a1
+	/* Restore run (vcpu->run) */
+	LONG_L	a0, VCPU_RUN(a1)
+	/* Save pointer to run in s0, will be saved by the compiler */
+	move	s0, a0

-   /* Restore run (vcpu->run) */
-    LONG_L      a0, VCPU_RUN(a1)
-    /* Save pointer to run in s0, will be saved by the compiler */
-    move        s0, a0
+	/* Save Host level EPC, BadVaddr and Cause to VCPU, useful to
+	 * process the exception */
+	mfc0	k0,CP0_EPC
+	LONG_S	k0, VCPU_PC(k1)
+
+	mfc0	k0, CP0_BADVADDR
+	LONG_S	k0, VCPU_HOST_CP0_BADVADDR(k1)
+
+	mfc0	k0, CP0_CAUSE
+	LONG_S	k0, VCPU_HOST_CP0_CAUSE(k1)
+
+	mfc0	k0, CP0_ENTRYHI
+	LONG_S	k0, VCPU_HOST_ENTRYHI(k1)
+
+	/* Now restore the host state just enough to run the handlers */
+
+	/* Swtich EBASE to the one used by Linux */
+	/* load up the host EBASE */
+	mfc0	v0, CP0_STATUS
+
+	.set	at
+	or	k0, v0, ST0_BEV
+	.set	noat
+
+	mtc0	k0, CP0_STATUS
+	ehb
+
+	LONG_L	k0, VCPU_HOST_EBASE(k1)
+	mtc0	k0,CP0_EBASE


-    /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process the exception */
-    mfc0    k0,CP0_EPC
-    LONG_S  k0, VCPU_PC(k1)
+	/* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+	.set	at
+	and	v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
+	or	v0, v0, ST0_CU0
+	.set	noat
+	mtc0	v0, CP0_STATUS
+	ehb

-    mfc0    k0, CP0_BADVADDR
-    LONG_S  k0, VCPU_HOST_CP0_BADVADDR(k1)
+	/* Load up host GP */
+	LONG_L	gp, VCPU_HOST_GP(k1)

-    mfc0    k0, CP0_CAUSE
-    LONG_S  k0, VCPU_HOST_CP0_CAUSE(k1)
+	/* Need a stack before we can jump to "C" */
+	LONG_L	sp, VCPU_HOST_STACK(k1)

-    mfc0    k0, CP0_ENTRYHI
-    LONG_S  k0, VCPU_HOST_ENTRYHI(k1)
+	/* Saved host state */
+	INT_ADDIU sp, sp, -PT_SIZE

-    /* Now restore the host state just enough to run the handlers */
+	/* XXXKYMA do we need to load the host ASID, maybe not because the
+	 * kernel entries are marked GLOBAL, need to verify
+	 */

-    /* Swtich EBASE to the one used by Linux */
-    /* load up the host EBASE */
-    mfc0        v0, CP0_STATUS
+	/* Restore host DDATA_LO */
+	LONG_L	k0, PT_HOST_USERLOCAL(sp)
+	mtc0	k0, CP0_DDATA_LO

-    .set at
-	or          k0, v0, ST0_BEV
-    .set noat
+	/* Restore RDHWR access */
+	PTR_LI	k0, 0x2000000F
+	mtc0	k0, CP0_HWRENA

-    mtc0        k0, CP0_STATUS
-    ehb
-
-    LONG_L      k0, VCPU_HOST_EBASE(k1)
-    mtc0        k0,CP0_EBASE
-
-
-    /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
-    .set at
-	and         v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
-    or          v0, v0, ST0_CU0
-    .set noat
-    mtc0        v0, CP0_STATUS
-    ehb
-
-    /* Load up host GP */
-    LONG_L  gp, VCPU_HOST_GP(k1)
-
-    /* Need a stack before we can jump to "C" */
-    LONG_L  sp, VCPU_HOST_STACK(k1)
-
-    /* Saved host state */
-    addiu   sp,sp, -PT_SIZE
-
-    /* XXXKYMA do we need to load the host ASID, maybe not because the
-     * kernel entries are marked GLOBAL, need to verify
-     */
-
-    /* Restore host DDATA_LO */
-    LONG_L      k0, PT_HOST_USERLOCAL(sp)
-    mtc0        k0, CP0_DDATA_LO
-
-    /* Restore RDHWR access */
-    la      k0, 0x2000000F
-    mtc0    k0,  CP0_HWRENA
-
-    /* Jump to handler */
+	/* Jump to handler */
 FEXPORT(__kvm_mips_jump_to_handler)
-    /* XXXKYMA: not sure if this is safe, how large is the stack?? */
-    /* Now jump to the kvm_mips_handle_exit() to see if we can deal with this in the kernel */
-    la          t9,kvm_mips_handle_exit
-    jalr.hb     t9
-    addiu       sp,sp, -CALLFRAME_SIZ           /* BD Slot */
+	/* XXXKYMA: not sure if this is safe, how large is the stack??
+	 * Now jump to the kvm_mips_handle_exit() to see if we can deal
+	 * with this in the kernel */
+	PTR_LA	t9, kvm_mips_handle_exit
+	jalr.hb	t9
+	 INT_ADDIU sp, sp, -CALLFRAME_SIZ           /* BD Slot */

-    /* Return from handler Make sure interrupts are disabled */
-    di
-    ehb
+	/* Return from handler Make sure interrupts are disabled */
+	di
+	ehb

-    /* XXXKYMA: k0/k1 could have been blown away if we processed an exception
-     * while we were handling the exception from the guest, reload k1
-     */
-    move        k1, s1
-	addiu		k1, k1, VCPU_HOST_ARCH
+	/* XXXKYMA: k0/k1 could have been blown away if we processed
+	 * an exception while we were handling the exception from the
+	 * guest, reload k1
+	 */

-    /* Check return value, should tell us if we are returning to the host (handle I/O etc)
-     * or resuming the guest
-     */
-    andi        t0, v0, RESUME_HOST
-    bnez        t0, __kvm_mips_return_to_host
-    nop
+	move	k1, s1
+	INT_ADDIU k1, k1, VCPU_HOST_ARCH
+
+	/* Check return value, should tell us if we are returning to the
+	 * host (handle I/O etc)or resuming the guest
+	 */
+	andi	t0, v0, RESUME_HOST
+	bnez	t0, __kvm_mips_return_to_host
+	 nop

 __kvm_mips_return_to_guest:
-    /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-    mtc0        s1, CP0_DDATA_LO
+	/* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
+	mtc0	s1, CP0_DDATA_LO

-    /* Load up the Guest EBASE to minimize the window where BEV is set */
-    LONG_L      t0, VCPU_GUEST_EBASE(k1)
+	/* Load up the Guest EBASE to minimize the window where BEV is set */
+	LONG_L	t0, VCPU_GUEST_EBASE(k1)

-    /* Switch EBASE back to the one used by KVM */
-    mfc0        v1, CP0_STATUS
-    .set at
-	or          k0, v1, ST0_BEV
-    .set noat
-    mtc0        k0, CP0_STATUS
-    ehb
-    mtc0        t0,CP0_EBASE
-
-    /* Setup status register for running guest in UM */
-    .set at
-    or     v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-    and     v1, v1, ~ST0_CU0
-    .set noat
-    mtc0    v1, CP0_STATUS
-    ehb
+	/* Switch EBASE back to the one used by KVM */
+	mfc0	v1, CP0_STATUS
+	.set	at
+	or	k0, v1, ST0_BEV
+	.set	noat
+	mtc0	k0, CP0_STATUS
+	ehb
+	mtc0	t0, CP0_EBASE

+	/* Setup status register for running guest in UM */
+	.set	at
+	or	v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
+	and	v1, v1, ~ST0_CU0
+	.set	noat
+	mtc0	v1, CP0_STATUS
+	ehb

 	/* Set Guest EPC */
-	LONG_L		t0, VCPU_PC(k1)
-	mtc0		t0, CP0_EPC
+	LONG_L	t0, VCPU_PC(k1)
+	mtc0	t0, CP0_EPC

-    /* Set the ASID for the Guest Kernel */
-    sll         t0, t0, 1                       /* with kseg0 @ 0x40000000, kernel */
-                                                /* addresses shift to 0x80000000 */
-    bltz        t0, 1f                          /* If kernel */
-	addiu       t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-    addiu       t1, k1, VCPU_GUEST_USER_ASID    /* else user */
+	/* Set the ASID for the Guest Kernel */
+	INT_SLL	t0, t0, 1	/* with kseg0 @ 0x40000000, kernel */
+				/* addresses shift to 0x80000000 */
+	bltz	t0, 1f		/* If kernel */
+	 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
+	INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
 1:
-    /* t1: contains the base of the ASID array, need to get the cpu id  */
-    LONG_L      t2, TI_CPU($28)             /* smp_processor_id */
-    sll         t2, t2, 2                   /* x4 */
-    addu        t3, t1, t2
-    LONG_L      k0, (t3)
-    andi        k0, k0, 0xff
-	mtc0		k0,CP0_ENTRYHI
-    ehb
+	/* t1: contains the base of the ASID array, need to get the cpu id  */
+	LONG_L	t2, TI_CPU($28)		/* smp_processor_id */
+	INT_SLL	t2, t2, 2		/* x4 */
+	REG_ADDU t3, t1, t2
+	LONG_L	k0, (t3)
+	andi	k0, k0, 0xff
+	mtc0	k0,CP0_ENTRYHI
+	ehb

-    /* Disable RDHWR access */
-    mtc0    zero,  CP0_HWRENA
+	/* Disable RDHWR access */
+	mtc0    zero,  CP0_HWRENA

-    /* load the guest context from VCPU and return */
-    LONG_L  $0, VCPU_R0(k1)
-    LONG_L  $1, VCPU_R1(k1)
-    LONG_L  $2, VCPU_R2(k1)
-    LONG_L  $3, VCPU_R3(k1)
-    LONG_L  $4, VCPU_R4(k1)
-    LONG_L  $5, VCPU_R5(k1)
-    LONG_L  $6, VCPU_R6(k1)
-    LONG_L  $7, VCPU_R7(k1)
-    LONG_L  $8, VCPU_R8(k1)
-    LONG_L  $9, VCPU_R9(k1)
-    LONG_L  $10, VCPU_R10(k1)
-    LONG_L  $11, VCPU_R11(k1)
-    LONG_L  $12, VCPU_R12(k1)
-    LONG_L  $13, VCPU_R13(k1)
-    LONG_L  $14, VCPU_R14(k1)
-    LONG_L  $15, VCPU_R15(k1)
-    LONG_L  $16, VCPU_R16(k1)
-    LONG_L  $17, VCPU_R17(k1)
-    LONG_L  $18, VCPU_R18(k1)
-    LONG_L  $19, VCPU_R19(k1)
-    LONG_L  $20, VCPU_R20(k1)
-    LONG_L  $21, VCPU_R21(k1)
-    LONG_L  $22, VCPU_R22(k1)
-    LONG_L  $23, VCPU_R23(k1)
-    LONG_L  $24, VCPU_R24(k1)
-    LONG_L  $25, VCPU_R25(k1)
+	/* load the guest context from VCPU and return */
+	LONG_L	$0, VCPU_R0(k1)
+	LONG_L	$1, VCPU_R1(k1)
+	LONG_L	$2, VCPU_R2(k1)
+	LONG_L	$3, VCPU_R3(k1)
+	LONG_L	$4, VCPU_R4(k1)
+	LONG_L	$5, VCPU_R5(k1)
+	LONG_L	$6, VCPU_R6(k1)
+	LONG_L	$7, VCPU_R7(k1)
+	LONG_L	$8, VCPU_R8(k1)
+	LONG_L	$9, VCPU_R9(k1)
+	LONG_L	$10, VCPU_R10(k1)
+	LONG_L	$11, VCPU_R11(k1)
+	LONG_L	$12, VCPU_R12(k1)
+	LONG_L	$13, VCPU_R13(k1)
+	LONG_L	$14, VCPU_R14(k1)
+	LONG_L	$15, VCPU_R15(k1)
+	LONG_L	$16, VCPU_R16(k1)
+	LONG_L	$17, VCPU_R17(k1)
+	LONG_L	$18, VCPU_R18(k1)
+	LONG_L	$19, VCPU_R19(k1)
+	LONG_L	$20, VCPU_R20(k1)
+	LONG_L	$21, VCPU_R21(k1)
+	LONG_L	$22, VCPU_R22(k1)
+	LONG_L	$23, VCPU_R23(k1)
+	LONG_L	$24, VCPU_R24(k1)
+	LONG_L	$25, VCPU_R25(k1)

-    /* $/k1 loaded later */
-    LONG_L  $28, VCPU_R28(k1)
-    LONG_L  $29, VCPU_R29(k1)
-    LONG_L  $30, VCPU_R30(k1)
-    LONG_L  $31, VCPU_R31(k1)
+	/* $/k1 loaded later */
+	LONG_L	$28, VCPU_R28(k1)
+	LONG_L	$29, VCPU_R29(k1)
+	LONG_L	$30, VCPU_R30(k1)
+	LONG_L	$31, VCPU_R31(k1)

 FEXPORT(__kvm_mips_skip_guest_restore)
-    LONG_L  k0, VCPU_HI(k1)
-    mthi    k0
+	LONG_L	k0, VCPU_HI(k1)
+	mthi	k0

-    LONG_L  k0, VCPU_LO(k1)
-    mtlo    k0
+	LONG_L	k0, VCPU_LO(k1)
+	mtlo	k0

-    LONG_L  k0, VCPU_R26(k1)
-    LONG_L  k1, VCPU_R27(k1)
+	LONG_L	k0, VCPU_R26(k1)
+	LONG_L	k1, VCPU_R27(k1)

-    eret
+	eret

 __kvm_mips_return_to_host:
-    /* EBASE is already pointing to Linux */
-    LONG_L  k1, VCPU_HOST_STACK(k1)
-	addiu  	k1,k1, -PT_SIZE
+	/* EBASE is already pointing to Linux */
+	LONG_L	k1, VCPU_HOST_STACK(k1)
+	INT_ADDIU k1,k1, -PT_SIZE

-    /* Restore host DDATA_LO */
-    LONG_L      k0, PT_HOST_USERLOCAL(k1)
-    mtc0        k0, CP0_DDATA_LO
+	/* Restore host DDATA_LO */
+	LONG_L	k0, PT_HOST_USERLOCAL(k1)
+	mtc0	k0, CP0_DDATA_LO

-    /* Restore host ASID */
-    LONG_L      k0, PT_HOST_ASID(sp)
-    andi        k0, 0xff
-    mtc0        k0,CP0_ENTRYHI
-    ehb
+	/* Restore host ASID */
+	LONG_L	k0, PT_HOST_ASID(sp)
+	andi	k0, 0xff
+	mtc0	k0,CP0_ENTRYHI
+	ehb

-    /* Load context saved on the host stack */
-    LONG_L  $0, PT_R0(k1)
-    LONG_L  $1, PT_R1(k1)
+	/* Load context saved on the host stack */
+	LONG_L	$0, PT_R0(k1)
+	LONG_L	$1, PT_R1(k1)

-    /* r2/v0 is the return code, shift it down by 2 (arithmetic) to recover the err code  */
-    sra     k0, v0, 2
-    move    $2, k0
+	/* r2/v0 is the return code, shift it down by 2 (arithmetic)
+	 * to recover the err code  */
+	INT_SRA	k0, v0, 2
+	move	$2, k0

-    LONG_L  $3, PT_R3(k1)
-    LONG_L  $4, PT_R4(k1)
-    LONG_L  $5, PT_R5(k1)
-    LONG_L  $6, PT_R6(k1)
-    LONG_L  $7, PT_R7(k1)
-    LONG_L  $8, PT_R8(k1)
-    LONG_L  $9, PT_R9(k1)
-    LONG_L  $10, PT_R10(k1)
-    LONG_L  $11, PT_R11(k1)
-    LONG_L  $12, PT_R12(k1)
-    LONG_L  $13, PT_R13(k1)
-    LONG_L  $14, PT_R14(k1)
-    LONG_L  $15, PT_R15(k1)
-    LONG_L  $16, PT_R16(k1)
-    LONG_L  $17, PT_R17(k1)
-    LONG_L  $18, PT_R18(k1)
-    LONG_L  $19, PT_R19(k1)
-    LONG_L  $20, PT_R20(k1)
-    LONG_L  $21, PT_R21(k1)
-    LONG_L  $22, PT_R22(k1)
-    LONG_L  $23, PT_R23(k1)
-    LONG_L  $24, PT_R24(k1)
-    LONG_L  $25, PT_R25(k1)
+	LONG_L	$3, PT_R3(k1)
+	LONG_L	$4, PT_R4(k1)
+	LONG_L	$5, PT_R5(k1)
+	LONG_L	$6, PT_R6(k1)
+	LONG_L	$7, PT_R7(k1)
+	LONG_L	$8, PT_R8(k1)
+	LONG_L	$9, PT_R9(k1)
+	LONG_L	$10, PT_R10(k1)
+	LONG_L	$11, PT_R11(k1)
+	LONG_L	$12, PT_R12(k1)
+	LONG_L	$13, PT_R13(k1)
+	LONG_L	$14, PT_R14(k1)
+	LONG_L	$15, PT_R15(k1)
+	LONG_L	$16, PT_R16(k1)
+	LONG_L	$17, PT_R17(k1)
+	LONG_L	$18, PT_R18(k1)
+	LONG_L	$19, PT_R19(k1)
+	LONG_L	$20, PT_R20(k1)
+	LONG_L	$21, PT_R21(k1)
+	LONG_L	$22, PT_R22(k1)
+	LONG_L	$23, PT_R23(k1)
+	LONG_L	$24, PT_R24(k1)
+	LONG_L	$25, PT_R25(k1)

-    /* Host k0/k1 were not saved */
+	/* Host k0/k1 were not saved */

-    LONG_L  $28, PT_R28(k1)
-    LONG_L  $29, PT_R29(k1)
-    LONG_L  $30, PT_R30(k1)
+	LONG_L	$28, PT_R28(k1)
+	LONG_L	$29, PT_R29(k1)
+	LONG_L	$30, PT_R30(k1)

-    LONG_L  k0, PT_HI(k1)
-    mthi    k0
+	LONG_L	k0, PT_HI(k1)
+	mthi	k0

-    LONG_L  k0, PT_LO(k1)
-    mtlo    k0
+	LONG_L	k0, PT_LO(k1)
+	mtlo	k0

-    /* Restore RDHWR access */
-    la      k0, 0x2000000F
-    mtc0    k0,  CP0_HWRENA
+	/* Restore RDHWR access */
+	PTR_LI	k0, 0x2000000F
+	mtc0	k0,  CP0_HWRENA


-    /* Restore RA, which is the address we will return to */
-    LONG_L  ra, PT_R31(k1)
-    j       ra
-    nop
+	/* Restore RA, which is the address we will return to */
+	LONG_L  ra, PT_R31(k1)
+	j       ra
+	 nop

-    .set    pop
 VECTOR_END(MIPSX(GuestExceptionEnd))
 .end MIPSX(GuestException)

@ -627,24 +623,23 @@ MIPSX(exceptions):

 #define HW_SYNCI_Step       $1
 LEAF(MIPSX(SyncICache))
-    .set    push
+	.set	push
 	.set	mips32r2
-    beq     a1, zero, 20f
-    nop
-    addu    a1, a0, a1
-    rdhwr   v0, HW_SYNCI_Step
-    beq     v0, zero, 20f
-    nop
-
+	beq	a1, zero, 20f
+	 nop
+	REG_ADDU a1, a0, a1
+	rdhwr	v0, HW_SYNCI_Step
+	beq	v0, zero, 20f
+	 nop
 10:
-    synci   0(a0)
-    addu    a0, a0, v0
-    sltu    v1, a0, a1
-    bne     v1, zero, 10b
-    nop
-    sync
+	synci	0(a0)
+	REG_ADDU a0, a0, v0
+	sltu	v1, a0, a1
+	bne	v1, zero, 10b
+	 nop
+	sync
 20:
-    jr.hb   ra
-    nop
-    .set pop
+	jr.hb	ra
+	 nop
+	.set	pop
 END(MIPSX(SyncICache))
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@ -208,6 +208,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 	return 0;
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
                                struct kvm_userspace_memory_region *mem,
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 	return r;
 }

+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu) - 4;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u32 r;
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+	r = svcpu->last_inst;
+	svcpu_put(svcpu);
+	return r;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 	return vcpu->arch.last_inst;
 }

+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu) - 4;
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+	return vcpu->arch.last_inst;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.fault_dar;
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@ -37,7 +37,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)

 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
-extern int kvm_hpt_order;		/* order of preallocated HPTs */
+extern unsigned long kvm_rma_pages;
 #endif

 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 			/* (masks depend on page size) */
 			rb |= 0x1000;		/* page encoding in LP field */
 			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
 		}
 	} else {
 		/* 4kB page */
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@ -183,13 +183,9 @@ struct kvmppc_spapr_tce_table {
 	struct page *pages[0];
 };

-struct kvmppc_linear_info {
-	void		*base_virt;
-	unsigned long	 base_pfn;
-	unsigned long	 npages;
-	struct list_head list;
-	atomic_t	 use_count;
-	int		 type;
+struct kvm_rma_info {
+	atomic_t use_count;
+	unsigned long base_pfn;
 };

 /* XICS components, defined in book3s_xics.c */
@ -246,7 +242,7 @@ struct kvm_arch {
 	int tlbie_lock;
 	unsigned long lpcr;
 	unsigned long rmor;
-	struct kvmppc_linear_info *rma;
+	struct kvm_rma_info *rma;
 	unsigned long vrma_slb_v;
 	int rma_setup_done;
 	int using_mmu_notifiers;
@ -259,7 +255,7 @@ struct kvm_arch {
 	spinlock_t slot_phys_lock;
 	cpumask_t need_tlb_flush;
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
-	struct kvmppc_linear_info *hpt_li;
+	int hpt_cma_alloc;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@ -137,10 +137,10 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba, unsigned long tce);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
 				struct kvm_allocate_rma *rma);
-extern struct kvmppc_linear_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvmppc_linear_info *ri);
-extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
-extern void kvm_release_hpt(struct kvmppc_linear_info *li);
+extern struct kvm_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvm_rma_info *ri);
+extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
+extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
@ -261,6 +261,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 struct openpic;

 #ifdef CONFIG_KVM_BOOK3S_64_HV
+extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
 	paca[cpu].kvm_hstate.xics_phys = addr;
@ -281,13 +282,12 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 }

 extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
-extern void kvm_linear_init(void);

 #else
-static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+static inline void __init kvm_cma_reserve(void)
 {}

-static inline void kvm_linear_init(void)
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}

 static inline u32 kvmppc_get_xics_latch(void)
@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 	}
 }

-/* Please call after prepare_to_enter. This function puts the lazy ee state
-   back to normal mode, without actually enabling interrupts. */
-static inline void kvmppc_lazy_ee_enable(void)
+/*
+ * Please call after prepare_to_enter. This function puts the lazy ee and irq
+ * disabled tracking state back to normal mode, without actually enabling
+ * interrupts.
+ */
+static inline void kvmppc_fix_ee_before_entry(void)
 {
+	trace_hardirqs_on();
+
 #ifdef CONFIG_PPC64
 	/* Only need to enable IRQs by hard enabling them after this */
 	local_paca->irq_happened = 0;
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@ -454,6 +454,7 @@ int main(void)
 	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
 	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+	DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
 	DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
 	DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
 	DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@ -229,6 +229,8 @@ void __init early_setup(unsigned long dt_ptr)
 	/* Initialize the hash table or TLB handling */
 	early_init_mmu();

+	kvm_cma_reserve();
+
 	/*
 	 * Reserve any gigantic pages requested on the command line.
 	 * memblock needs to have been initialized by the time this is
@ -609,8 +611,6 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff */
 	mmu_context_init();

-	kvm_linear_init();
-
 	/* Interrupt code needs to be 64K-aligned */
 	if ((unsigned long)_stext & 0xffff)
 		panic("Kernelbase not 64K-aligned (0x%lx)!\n",
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@ -72,6 +72,7 @@ config KVM_BOOK3S_64_HV
 	bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
 	depends on KVM_BOOK3S_64
 	select MMU_NOTIFIER
+	select CMA
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
 	  virtual machines on POWER7 and PPC970 processors that have
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@ -81,6 +81,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_64_vio_hv.o \
 	book3s_hv_ras.o \
 	book3s_hv_builtin.o \
+	book3s_hv_cma.o \
 	$(kvm-book3s_64-builtin-xics-objs-y)

 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@ -182,10 +182,13 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	hva_t ptegp;
 	u64 pteg[16];
 	u64 avpn = 0;
+	u64 v, r;
+	u64 v_val, v_mask;
+	u64 eaddr_mask;
 	int i;
-	u8 key = 0;
+	u8 pp, key = 0;
 	bool found = false;
-	int second = 0;
+	bool second = false;
 	ulong mp_ea = vcpu->arch.magic_page_ea;

 	/* Magic page override */
@ -208,8 +211,16 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		goto no_seg_found;

 	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+	v_val = avpn & HPTE_V_AVPN;
+
 	if (slbe->tb)
-		avpn |= SLB_VSID_B_1T;
+		v_val |= SLB_VSID_B_1T;
+	if (slbe->large)
+		v_val |= HPTE_V_LARGE;
+	v_val |= HPTE_V_VALID;
+
+	v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
+		HPTE_V_SECONDARY;

 do_second:
 	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
@ -227,90 +238,73 @@ do_second:
 		key = 4;

 	for (i=0; i<16; i+=2) {
-		u64 v = pteg[i];
-		u64 r = pteg[i+1];
-
-		/* Valid check */
-		if (!(v & HPTE_V_VALID))
-			continue;
-		/* Hash check */
-		if ((v & HPTE_V_SECONDARY) != second)
-			continue;
-
-		/* AVPN compare */
-		if (HPTE_V_COMPARE(avpn, v)) {
-			u8 pp = (r & HPTE_R_PP) | key;
-			int eaddr_mask = 0xFFF;
-
-			gpte->eaddr = eaddr;
-			gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu,
-								    eaddr,
-								    data);
-			if (slbe->large)
-				eaddr_mask = 0xFFFFFF;
-			gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask);
-			gpte->may_execute = ((r & HPTE_R_N) ? false : true);
-			gpte->may_read = false;
-			gpte->may_write = false;
-
-			switch (pp) {
-			case 0:
-			case 1:
-			case 2:
-			case 6:
-				gpte->may_write = true;
-				/* fall through */
-			case 3:
-			case 5:
-			case 7:
-				gpte->may_read = true;
-				break;
-			}
-
-			dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
-				"-> 0x%lx\n",
-				eaddr, avpn, gpte->vpage, gpte->raddr);
+		/* Check all relevant fields of 1st dword */
+		if ((pteg[i] & v_mask) == v_val) {
 			found = true;
 			break;
 		}
 	}

+	if (!found) {
+		if (second)
+			goto no_page_found;
+		v_val |= HPTE_V_SECONDARY;
+		second = true;
+		goto do_second;
+	}
+
+	v = pteg[i];
+	r = pteg[i+1];
+	pp = (r & HPTE_R_PP) | key;
+	eaddr_mask = 0xFFF;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+	if (slbe->large)
+		eaddr_mask = 0xFFFFFF;
+	gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
+	gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+	gpte->may_read = false;
+	gpte->may_write = false;
+
+	switch (pp) {
+	case 0:
+	case 1:
+	case 2:
+	case 6:
+		gpte->may_write = true;
+		/* fall through */
+	case 3:
+	case 5:
+	case 7:
+		gpte->may_read = true;
+		break;
+	}
+
+	dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
+		"-> 0x%lx\n",
+		eaddr, avpn, gpte->vpage, gpte->raddr);
+
 	/* Update PTE R and C bits, so the guest's swapper knows we used the
 	 * page */
-	if (found) {
-		u32 oldr = pteg[i+1];
-
-		if (gpte->may_read) {
-			/* Set the accessed flag */
-			pteg[i+1] |= HPTE_R_R;
-		}
-		if (gpte->may_write) {
-			/* Set the dirty flag */
-			pteg[i+1] |= HPTE_R_C;
-		} else {
-			dprintk("KVM: Mapping read-only page!\n");
-		}
-
-		/* Write back into the PTEG */
-		if (pteg[i+1] != oldr)
-			copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
-
-		if (!gpte->may_read)
-			return -EPERM;
-		return 0;
-	} else {
-		dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
-			"ptegp=0x%lx)\n",
-			eaddr, to_book3s(vcpu)->sdr1, ptegp);
-		for (i = 0; i < 16; i += 2)
-			dprintk("   %02d: 0x%llx - 0x%llx (0x%llx)\n",
-				i, pteg[i], pteg[i+1], avpn);
-
-		if (!second) {
-			second = HPTE_V_SECONDARY;
-			goto do_second;
-		}
+	if (gpte->may_read) {
+		/* Set the accessed flag */
+		r |= HPTE_R_R;
 	}
+	if (data && gpte->may_write) {
+		/* Set the dirty flag -- XXX even if not writing */
+		r |= HPTE_R_C;
+	}
+
+	/* Write back into the PTEG */
+	if (pteg[i+1] != r) {
+		pteg[i+1] = r;
+		copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
+	}
+
+	if (!gpte->may_read)
+		return -EPERM;
+	return 0;

 no_page_found:
 	return -ENOENT;
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@ -37,6 +37,8 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>

+#include "book3s_hv_cma.h"
+
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63

@ -52,8 +54,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
 	unsigned long hpt;
 	struct revmap_entry *rev;
-	struct kvmppc_linear_info *li;
-	long order = kvm_hpt_order;
+	struct page *page = NULL;
+	long order = KVM_DEFAULT_HPT_ORDER;

 	if (htab_orderp) {
 		order = *htab_orderp;
@ -61,26 +63,23 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 			order = PPC_MIN_HPT_ORDER;
 	}

+	kvm->arch.hpt_cma_alloc = 0;
 	/*
-	 * If the user wants a different size from default,
 	 * try first to allocate it from the kernel page allocator.
+	 * We keep the CMA reserved for failed allocation.
 	 */
-	hpt = 0;
-	if (order != kvm_hpt_order) {
-		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
-				       __GFP_NOWARN, order - PAGE_SHIFT);
-		if (!hpt)
-			--order;
-	}
+	hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
+			       __GFP_NOWARN, order - PAGE_SHIFT);

 	/* Next try to allocate from the preallocated pool */
 	if (!hpt) {
-		li = kvm_alloc_hpt();
-		if (li) {
-			hpt = (ulong)li->base_virt;
-			kvm->arch.hpt_li = li;
-			order = kvm_hpt_order;
-		}
+		VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+		page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+		if (page) {
+			hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+			kvm->arch.hpt_cma_alloc = 1;
+		} else
+			--order;
 	}

 	/* Lastly try successively smaller sizes from the page allocator */
@ -118,8 +117,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 	return 0;

 out_freehpt:
-	if (kvm->arch.hpt_li)
-		kvm_release_hpt(kvm->arch.hpt_li);
+	if (kvm->arch.hpt_cma_alloc)
+		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
 	else
 		free_pages(hpt, order - PAGE_SHIFT);
 	return -ENOMEM;
@ -165,8 +164,9 @@ void kvmppc_free_hpt(struct kvm *kvm)
 {
 	kvmppc_free_lpid(kvm->arch.lpid);
 	vfree(kvm->arch.revmap);
-	if (kvm->arch.hpt_li)
-		kvm_release_hpt(kvm->arch.hpt_li);
+	if (kvm->arch.hpt_cma_alloc)
+		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
+				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
 	else
 		free_pages(kvm->arch.hpt_virt,
 			   kvm->arch.hpt_order - PAGE_SHIFT);
@ -1579,7 +1579,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
 	ctx->first_pass = 1;

 	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
-	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
 	if (ret < 0) {
 		kvm_put_kvm(kvm);
 		return ret;
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	mutex_unlock(&kvm->lock);

 	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				stt, O_RDWR);
+				stt, O_RDWR | O_CLOEXEC);

 fail:
 	if (stt) {
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@ -458,6 +458,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 	case SPRN_PMC4_GEKKO:
 	case SPRN_WPAR_GEKKO:
 	case SPRN_MSSSR0:
+	case SPRN_DABR:
 		break;
 unprivileged:
 	default:
@ -555,6 +556,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 	case SPRN_PMC4_GEKKO:
 	case SPRN_WPAR_GEKKO:
 	case SPRN_MSSSR0:
+	case SPRN_DABR:
 		*spr_val = 0;
 		break;
 	default:
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@ -680,13 +680,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 }

 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+				  struct kvm_sregs *sregs)
 {
 	int i;

-	sregs->pvr = vcpu->arch.pvr;
-
 	memset(sregs, 0, sizeof(struct kvm_sregs));
+	sregs->pvr = vcpu->arch.pvr;
 	for (i = 0; i < vcpu->arch.slb_max; i++) {
 		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
 		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
@ -696,7 +695,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 }

 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+				  struct kvm_sregs *sregs)
 {
 	int i, j;

@ -1511,10 +1510,10 @@ static inline int lpcr_rmls(unsigned long rma_size)

 static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct kvmppc_linear_info *ri = vma->vm_file->private_data;
 	struct page *page;
+	struct kvm_rma_info *ri = vma->vm_file->private_data;

-	if (vmf->pgoff >= ri->npages)
+	if (vmf->pgoff >= kvm_rma_pages)
 		return VM_FAULT_SIGBUS;

 	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
@ -1536,7 +1535,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)

 static int kvm_rma_release(struct inode *inode, struct file *filp)
 {
-	struct kvmppc_linear_info *ri = filp->private_data;
+	struct kvm_rma_info *ri = filp->private_data;

 	kvm_release_rma(ri);
 	return 0;
@ -1549,18 +1548,27 @@ static const struct file_operations kvm_rma_fops = {

 long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 {
-	struct kvmppc_linear_info *ri;
 	long fd;
+	struct kvm_rma_info *ri;
+	/*
+	 * Only do this on PPC970 in HV mode
+	 */
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_201))
+		return -EINVAL;
+
+	if (!kvm_rma_pages)
+		return -EINVAL;

 	ri = kvm_alloc_rma();
 	if (!ri)
 		return -ENOMEM;

-	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
 	if (fd < 0)
 		kvm_release_rma(ri);

-	ret->rma_size = ri->npages << PAGE_SHIFT;
+	ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
 	return fd;
 }

@ -1725,7 +1733,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
 	struct kvm *kvm = vcpu->kvm;
-	struct kvmppc_linear_info *ri = NULL;
+	struct kvm_rma_info *ri = NULL;
 	unsigned long hva;
 	struct kvm_memory_slot *memslot;
 	struct vm_area_struct *vma;
@ -1803,7 +1811,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)

 	} else {
 		/* Set up to use an RMO region */
-		rma_size = ri->npages;
+		rma_size = kvm_rma_pages;
 		if (rma_size > memslot->npages)
 			rma_size = memslot->npages;
 		rma_size <<= PAGE_SHIFT;
@ -1831,14 +1839,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 			/* POWER7 */
 			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
 			lpcr |= rmls << LPCR_RMLS_SH;
-			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+			kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
 		}
 		kvm->arch.lpcr = lpcr;
 		pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);

 		/* Initialize phys addrs of pages in RMO */
-		npages = ri->npages;
+		npages = kvm_rma_pages;
 		porder = __ilog2(npages);
 		physp = memslot->arch.slot_phys;
 		if (physp) {
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@ -13,33 +13,34 @@
 #include <linux/spinlock.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>

 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>

-#define KVM_LINEAR_RMA		0
-#define KVM_LINEAR_HPT		1
-
-static void __init kvm_linear_init_one(ulong size, int count, int type);
-static struct kvmppc_linear_info *kvm_alloc_linear(int type);
-static void kvm_release_linear(struct kvmppc_linear_info *ri);
-
-int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
-EXPORT_SYMBOL_GPL(kvm_hpt_order);
-
-/*************** RMA *************/
-
+#include "book3s_hv_cma.h"
 /*
- * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
+ * should be power of 2.
+ */
+#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */
+/*
+ * By default we reserve 5% of memory for hash pagetable allocation.
+ */
+static unsigned long kvm_cma_resv_ratio = 5;
+/*
+ * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area.
 * Each RMA has to be physically contiguous and of a size that the
 * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
 * and other larger sizes.  Since we are unlikely to be allocate that
 * much physically contiguous memory after the system is up and running,
- * we preallocate a set of RMAs in early boot for KVM to use.
+ * we preallocate a set of RMAs in early boot using CMA.
+ * should be power of 2.
 */
-static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
-static unsigned long kvm_rma_count;
+unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT;	/* 128MB */
+EXPORT_SYMBOL_GPL(kvm_rma_pages);

 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
   Assumes POWER7 or PPC970. */
@ -69,165 +70,114 @@ static inline int lpcr_rmls(unsigned long rma_size)

 static int __init early_parse_rma_size(char *p)
 {
+	unsigned long kvm_rma_size;
+
+	pr_debug("%s(%s)\n", __func__, p);
 	if (!p)
-		return 1;
-
+		return -EINVAL;
 	kvm_rma_size = memparse(p, &p);
-
+	/*
+	 * Check that the requested size is one supported in hardware
+	 */
+	if (lpcr_rmls(kvm_rma_size) < 0) {
+		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+		return -EINVAL;
+	}
+	kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT;
 	return 0;
 }
 early_param("kvm_rma_size", early_parse_rma_size);

-static int __init early_parse_rma_count(char *p)
+struct kvm_rma_info *kvm_alloc_rma()
 {
-	if (!p)
-		return 1;
+	struct page *page;
+	struct kvm_rma_info *ri;

-	kvm_rma_count = simple_strtoul(p, NULL, 0);
-
-	return 0;
-}
-early_param("kvm_rma_count", early_parse_rma_count);
-
-struct kvmppc_linear_info *kvm_alloc_rma(void)
-{
-	return kvm_alloc_linear(KVM_LINEAR_RMA);
+	ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
+	if (!ri)
+		return NULL;
+	page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
+	if (!page)
+		goto err_out;
+	atomic_set(&ri->use_count, 1);
+	ri->base_pfn = page_to_pfn(page);
+	return ri;
+err_out:
+	kfree(ri);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_alloc_rma);

-void kvm_release_rma(struct kvmppc_linear_info *ri)
+void kvm_release_rma(struct kvm_rma_info *ri)
 {
-	kvm_release_linear(ri);
+	if (atomic_dec_and_test(&ri->use_count)) {
+		kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
+		kfree(ri);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_release_rma);

-/*************** HPT *************/
-
-/*
- * This maintains a list of big linear HPT tables that contain the GVA->HPA
- * memory mappings. If we don't reserve those early on, we might not be able
- * to get a big (usually 16MB) linear memory region from the kernel anymore.
- */
-
-static unsigned long kvm_hpt_count;
-
-static int __init early_parse_hpt_count(char *p)
+static int __init early_parse_kvm_cma_resv(char *p)
 {
+	pr_debug("%s(%s)\n", __func__, p);
 	if (!p)
-		return 1;
-
-	kvm_hpt_count = simple_strtoul(p, NULL, 0);
-
-	return 0;
+		return -EINVAL;
+	return kstrtoul(p, 0, &kvm_cma_resv_ratio);
 }
-early_param("kvm_hpt_count", early_parse_hpt_count);
+early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);

-struct kvmppc_linear_info *kvm_alloc_hpt(void)
+struct page *kvm_alloc_hpt(unsigned long nr_pages)
 {
-	return kvm_alloc_linear(KVM_LINEAR_HPT);
+	unsigned long align_pages = HPT_ALIGN_PAGES;
+
+	/* Old CPUs require HPT aligned on a multiple of its size */
+	if (!cpu_has_feature(CPU_FTR_ARCH_206))
+		align_pages = nr_pages;
+	return kvm_alloc_cma(nr_pages, align_pages);
 }
 EXPORT_SYMBOL_GPL(kvm_alloc_hpt);

-void kvm_release_hpt(struct kvmppc_linear_info *li)
+void kvm_release_hpt(struct page *page, unsigned long nr_pages)
 {
-	kvm_release_linear(li);
+	kvm_release_cma(page, nr_pages);
 }
 EXPORT_SYMBOL_GPL(kvm_release_hpt);

-/*************** generic *************/
-
-static LIST_HEAD(free_linears);
-static DEFINE_SPINLOCK(linear_lock);
-
-static void __init kvm_linear_init_one(ulong size, int count, int type)
-{
-	unsigned long i;
-	unsigned long j, npages;
-	void *linear;
-	struct page *pg;
-	const char *typestr;
-	struct kvmppc_linear_info *linear_info;
-
-	if (!count)
-		return;
-
-	typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
-
-	npages = size >> PAGE_SHIFT;
-	linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
-	for (i = 0; i < count; ++i) {
-		linear = alloc_bootmem_align(size, size);
-		pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
-			 size >> 20);
-		linear_info[i].base_virt = linear;
-		linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
-		linear_info[i].npages = npages;
-		linear_info[i].type = type;
-		list_add_tail(&linear_info[i].list, &free_linears);
-		atomic_set(&linear_info[i].use_count, 0);
-
-		pg = pfn_to_page(linear_info[i].base_pfn);
-		for (j = 0; j < npages; ++j) {
-			atomic_inc(&pg->_count);
-			++pg;
-		}
-	}
-}
-
-static struct kvmppc_linear_info *kvm_alloc_linear(int type)
-{
-	struct kvmppc_linear_info *ri, *ret;
-
-	ret = NULL;
-	spin_lock(&linear_lock);
-	list_for_each_entry(ri, &free_linears, list) {
-		if (ri->type != type)
-			continue;
-
-		list_del(&ri->list);
-		atomic_inc(&ri->use_count);
-		memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
-		ret = ri;
-		break;
-	}
-	spin_unlock(&linear_lock);
-	return ret;
-}
-
-static void kvm_release_linear(struct kvmppc_linear_info *ri)
-{
-	if (atomic_dec_and_test(&ri->use_count)) {
-		spin_lock(&linear_lock);
-		list_add_tail(&ri->list, &free_linears);
-		spin_unlock(&linear_lock);
-
-	}
-}
-
-/*
- * Called at boot time while the bootmem allocator is active,
- * to allocate contiguous physical memory for the hash page
- * tables for guests.
+/**
+ * kvm_cma_reserve() - reserve area for kvm hash pagetable
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
 */
-void __init kvm_linear_init(void)
+void __init kvm_cma_reserve(void)
 {
-	/* HPT */
-	kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
+	unsigned long align_size;
+	struct memblock_region *reg;
+	phys_addr_t selected_size = 0;
+	/*
+	 * We cannot use memblock_phys_mem_size() here, because
+	 * memblock_analyze() has not been called yet.
+	 */
+	for_each_memblock(memory, reg)
+		selected_size += memblock_region_memory_end_pfn(reg) -
+				 memblock_region_memory_base_pfn(reg);

-	/* RMA */
-	/* Only do this on PPC970 in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_201))
-		return;
+	selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
+	if (selected_size) {
+		pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+			 (unsigned long)selected_size / SZ_1M);
+		/*
+		 * Old CPUs require HPT aligned on a multiple of its size. So for them
+		 * make the alignment as max size we could request.
+		 */
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
+			align_size = __rounddown_pow_of_two(selected_size);
+		else
+			align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;

-	if (!kvm_rma_size || !kvm_rma_count)
-		return;
-
-	/* Check that the requested size is one supported in hardware */
-	if (lpcr_rmls(kvm_rma_size) < 0) {
-		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
-		return;
+		align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
+		kvm_cma_declare_contiguous(selected_size, align_size);
 	}
-
-	kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
 }
--- a/arch/powerpc/kvm/book3s_hv_cma.c
+++ b/arch/powerpc/kvm/book3s_hv_cma.c
@ -0,0 +1,240 @@
+/*
+ * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA
+ * for DMA mapping framework
+ *
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ *
+ */
+#define pr_fmt(fmt) "kvm_cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+#  define DEBUG
+#endif
+#endif
+
+#include <linux/memblock.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+#include "book3s_hv_cma.h"
+
+struct kvm_cma {
+	unsigned long	base_pfn;
+	unsigned long	count;
+	unsigned long	*bitmap;
+};
+
+static DEFINE_MUTEX(kvm_cma_mutex);
+static struct kvm_cma kvm_cma_area;
+
+/**
+ * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling
+ *			          for kvm hash pagetable
+ * @size:  Size of the reserved memory.
+ * @alignment:  Alignment for the contiguous memory area
+ *
+ * This function reserves memory for kvm cma area. It should be
+ * called by arch code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment)
+{
+	long base_pfn;
+	phys_addr_t addr;
+	struct kvm_cma *cma = &kvm_cma_area;
+
+	pr_debug("%s(size %lx)\n", __func__, (unsigned long)size);
+
+	if (!size)
+		return -EINVAL;
+	/*
+	 * Sanitise input arguments.
+	 * We should be pageblock aligned for CMA.
+	 */
+	alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order));
+	size = ALIGN(size, alignment);
+	/*
+	 * Reserve memory
+	 * Use __memblock_alloc_base() since
+	 * memblock_alloc_base() panic()s.
+	 */
+	addr = __memblock_alloc_base(size, alignment, 0);
+	if (!addr) {
+		base_pfn = -ENOMEM;
+		goto err;
+	} else
+		base_pfn = PFN_DOWN(addr);
+
+	/*
+	 * Each reserved area must be initialised later, when more kernel
+	 * subsystems (like slab allocator) are available.
+	 */
+	cma->base_pfn = base_pfn;
+	cma->count    = size >> PAGE_SHIFT;
+	pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M);
+	return 0;
+err:
+	pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+	return base_pfn;
+}
+
+/**
+ * kvm_alloc_cma() - allocate pages from contiguous area
+ * @nr_pages: Requested number of pages.
+ * @align_pages: Requested alignment in number of pages
+ *
+ * This function allocates memory buffer for hash pagetable.
+ */
+struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages)
+{
+	int ret;
+	struct page *page = NULL;
+	struct kvm_cma *cma = &kvm_cma_area;
+	unsigned long chunk_count, nr_chunk;
+	unsigned long mask, pfn, pageno, start = 0;
+
+
+	if (!cma || !cma->count)
+		return NULL;
+
+	pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__,
+		 (void *)cma, nr_pages, align_pages);
+
+	if (!nr_pages)
+		return NULL;
+	/*
+	 * align mask with chunk size. The bit tracks pages in chunk size
+	 */
+	VM_BUG_ON(!is_power_of_2(align_pages));
+	mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;
+	BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER);
+
+	chunk_count = cma->count >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+	nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+	mutex_lock(&kvm_cma_mutex);
+	for (;;) {
+		pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count,
+						    start, nr_chunk, mask);
+		if (pageno >= chunk_count)
+			break;
+
+		pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT));
+		ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA);
+		if (ret == 0) {
+			bitmap_set(cma->bitmap, pageno, nr_chunk);
+			page = pfn_to_page(pfn);
+			memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT);
+			break;
+		} else if (ret != -EBUSY) {
+			break;
+		}
+		pr_debug("%s(): memory range at %p is busy, retrying\n",
+			 __func__, pfn_to_page(pfn));
+		/* try again with a bit different memory target */
+		start = pageno + mask + 1;
+	}
+	mutex_unlock(&kvm_cma_mutex);
+	pr_debug("%s(): returned %p\n", __func__, page);
+	return page;
+}
+
+/**
+ * kvm_release_cma() - release allocated pages for hash pagetable
+ * @pages: Allocated pages.
+ * @nr_pages: Number of allocated pages.
+ *
+ * This function releases memory allocated by kvm_alloc_cma().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool kvm_release_cma(struct page *pages, unsigned long nr_pages)
+{
+	unsigned long pfn;
+	unsigned long nr_chunk;
+	struct kvm_cma *cma = &kvm_cma_area;
+
+	if (!cma || !pages)
+		return false;
+
+	pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages);
+
+	pfn = page_to_pfn(pages);
+
+	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+		return false;
+
+	VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count);
+	nr_chunk = nr_pages >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+	mutex_lock(&kvm_cma_mutex);
+	bitmap_clear(cma->bitmap,
+		     (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT),
+		     nr_chunk);
+	free_contig_range(pfn, nr_pages);
+	mutex_unlock(&kvm_cma_mutex);
+
+	return true;
+}
+
+static int __init kvm_cma_activate_area(unsigned long base_pfn,
+					unsigned long count)
+{
+	unsigned long pfn = base_pfn;
+	unsigned i = count >> pageblock_order;
+	struct zone *zone;
+
+	WARN_ON_ONCE(!pfn_valid(pfn));
+	zone = page_zone(pfn_to_page(pfn));
+	do {
+		unsigned j;
+		base_pfn = pfn;
+		for (j = pageblock_nr_pages; j; --j, pfn++) {
+			WARN_ON_ONCE(!pfn_valid(pfn));
+			/*
+			 * alloc_contig_range requires the pfn range
+			 * specified to be in the same zone. Make this
+			 * simple by forcing the entire CMA resv range
+			 * to be in the same zone.
+			 */
+			if (page_zone(pfn_to_page(pfn)) != zone)
+				return -EINVAL;
+		}
+		init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+	} while (--i);
+	return 0;
+}
+
+static int __init kvm_cma_init_reserved_areas(void)
+{
+	int bitmap_size, ret;
+	unsigned long chunk_count;
+	struct kvm_cma *cma = &kvm_cma_area;
+
+	pr_debug("%s()\n", __func__);
+	if (!cma->count)
+		return 0;
+	chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+	bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long);
+	cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!cma->bitmap)
+		return -ENOMEM;
+
+	ret = kvm_cma_activate_area(cma->base_pfn, cma->count);
+	if (ret)
+		goto error;
+	return 0;
+
+error:
+	kfree(cma->bitmap);
+	return ret;
+}
+core_initcall(kvm_cma_init_reserved_areas);
--- a/arch/powerpc/kvm/book3s_hv_cma.h
+++ b/arch/powerpc/kvm/book3s_hv_cma.h
@ -0,0 +1,27 @@
+/*
+ * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA
+ * for DMA mapping framework
+ *
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ *
+ */
+
+#ifndef __POWERPC_KVM_CMA_ALLOC_H__
+#define __POWERPC_KVM_CMA_ALLOC_H__
+/*
+ * Both RMA and Hash page allocation will be multiple of 256K.
+ */
+#define KVM_CMA_CHUNK_ORDER	18
+
+extern struct page *kvm_alloc_cma(unsigned long nr_pages,
+				  unsigned long align_pages);
+extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages);
+extern long kvm_cma_declare_contiguous(phys_addr_t size,
+				       phys_addr_t alignment) __init;
+#endif
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@ -383,6 +383,80 @@ static inline int try_lock_tlbie(unsigned int *lock)
 	return old == 0;
 }

+/*
+ * tlbie/tlbiel is a bit different on the PPC970 compared to later
+ * processors such as POWER7; the large page bit is in the instruction
+ * not RB, and the top 16 bits and the bottom 12 bits of the VA
+ * in RB must be 0.
+ */
+static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
+			  long npages, int global, bool need_sync)
+{
+	long i;
+
+	if (global) {
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			unsigned long rb = rbvalues[i];
+
+			if (rb & 1)		/* large page */
+				asm volatile("tlbie %0,1" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+			else
+				asm volatile("tlbie %0,0" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+		}
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			unsigned long rb = rbvalues[i];
+
+			if (rb & 1)		/* large page */
+				asm volatile("tlbiel %0,1" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+			else
+				asm volatile("tlbiel %0,0" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+}
+
+static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
+		      long npages, int global, bool need_sync)
+{
+	long i;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+		/* PPC970 tlbie instruction is a bit different */
+		do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
+		return;
+	}
+	if (global) {
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i)
+			asm volatile(PPC_TLBIE(%1,%0) : :
+				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i)
+			asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+}
+
 long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 			unsigned long pte_index, unsigned long avpn,
 			unsigned long *hpret)
@ -408,19 +482,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~HPTE_V_VALID;
 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
-		if (global_invalidates(kvm, flags)) {
-			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-				cpu_relax();
-			asm volatile("ptesync" : : : "memory");
-			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-				     : : "r" (rb), "r" (kvm->arch.lpid));
-			asm volatile("ptesync" : : : "memory");
-			kvm->arch.tlbie_lock = 0;
-		} else {
-			asm volatile("ptesync" : : : "memory");
-			asm volatile("tlbiel %0" : : "r" (rb));
-			asm volatile("ptesync" : : : "memory");
-		}
+		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
 		/* Read PTE low word after tlbie to get final R/C values */
 		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
 	}
@ -448,12 +510,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 	unsigned long *hp, *hptes[4], tlbrb[4];
 	long int i, j, k, n, found, indexes[4];
 	unsigned long flags, req, pte_index, rcbits;
-	long int local = 0;
+	int global;
 	long int ret = H_SUCCESS;
 	struct revmap_entry *rev, *revs[4];

-	if (atomic_read(&kvm->online_vcpus) == 1)
-		local = 1;
+	global = global_invalidates(kvm, 0);
 	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
 		n = 0;
 		for (; i < 4; ++i) {
@ -529,22 +590,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			break;

 		/* Now that we've collected a batch, do the tlbies */
-		if (!local) {
-			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-				cpu_relax();
-			asm volatile("ptesync" : : : "memory");
-			for (k = 0; k < n; ++k)
-				asm volatile(PPC_TLBIE(%1,%0) : :
-					     "r" (tlbrb[k]),
-					     "r" (kvm->arch.lpid));
-			asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-			kvm->arch.tlbie_lock = 0;
-		} else {
-			asm volatile("ptesync" : : : "memory");
-			for (k = 0; k < n; ++k)
-				asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
-			asm volatile("ptesync" : : : "memory");
-		}
+		do_tlbies(kvm, tlbrb, n, global, true);

 		/* Read PTE low words after tlbie to get final R/C values */
 		for (k = 0; k < n; ++k) {
@ -603,19 +649,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		rb = compute_tlbie_rb(v, r, pte_index);
 		hpte[0] = v & ~HPTE_V_VALID;
-		if (global_invalidates(kvm, flags)) {
-			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-				cpu_relax();
-			asm volatile("ptesync" : : : "memory");
-			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-				     : : "r" (rb), "r" (kvm->arch.lpid));
-			asm volatile("ptesync" : : : "memory");
-			kvm->arch.tlbie_lock = 0;
-		} else {
-			asm volatile("ptesync" : : : "memory");
-			asm volatile("tlbiel %0" : : "r" (rb));
-			asm volatile("ptesync" : : : "memory");
-		}
+		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
 		/*
 		 * If the host has this page as readonly but the guest
 		 * wants to make it read/write, reduce the permissions.
@ -686,13 +720,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,

 	hptep[0] &= ~HPTE_V_VALID;
 	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
-	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-		cpu_relax();
-	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-		     : : "r" (rb), "r" (kvm->arch.lpid));
-	asm volatile("ptesync" : : : "memory");
-	kvm->arch.tlbie_lock = 0;
+	do_tlbies(kvm, &rb, 1, 1, true);
 }
 EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

@ -706,12 +734,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
 	rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
 	/* modify only the second-last byte, which contains the ref bit */
 	*((char *)hptep + 14) = rbyte;
-	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-		cpu_relax();
-	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-		     : : "r" (rb), "r" (kvm->arch.lpid));
-	asm volatile("ptesync" : : : "memory");
-	kvm->arch.tlbie_lock = 0;
+	do_tlbies(kvm, &rb, 1, 1, false);
 }
 EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@ -1381,7 +1381,7 @@ hcall_try_real_mode:
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
 	bge	guest_exit_cont
 	LOAD_REG_ADDR(r4, hcall_real_table)
-	lwzx	r3,r3,r4
+	lwax	r3,r3,r4
 	cmpwi	r3,0
 	beq	guest_exit_cont
 	add	r3,r3,r4
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@ -92,6 +92,11 @@ kvm_start_lightweight:
 	PPC_LL	r3, VCPU_HFLAGS(r4)
 	rldicl	r3, r3, 0, 63		/* r3 &= 1 */
 	stb	r3, HSTATE_RESTORE_HID5(r13)
+
+	/* Load up guest SPRG3 value, since it's user readable */
+	ld	r3, VCPU_SHARED(r4)
+	ld	r3, VCPU_SHARED_SPRG3(r3)
+	mtspr	SPRN_SPRG3, r3
 #endif /* CONFIG_PPC_BOOK3S_64 */

 	PPC_LL	r4, VCPU_SHADOW_MSR(r4)	/* get shadow_msr */
@ -123,6 +128,15 @@ kvmppc_handler_highmem:
 	/* R7 = vcpu */
 	PPC_LL	r7, GPR4(r1)

+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Reload kernel SPRG3 value.
+	 * No need to save guest value as usermode can't modify SPRG3.
+	 */
+	ld	r3, PACA_SPRG3(r13)
+	mtspr	SPRN_SPRG3, r3
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 	PPC_STL	r14, VCPU_GPR(R14)(r7)
 	PPC_STL	r15, VCPU_GPR(R15)(r7)
 	PPC_STL	r16, VCPU_GPR(R16)(r7)
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 		 * both the traditional FP registers and the added VSX
 		 * registers into thread.fpr[].
 		 */
-		giveup_fpu(current);
+		if (current->thread.regs->msr & MSR_FP)
+			giveup_fpu(current);
 		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
 			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];

@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)

 #ifdef CONFIG_ALTIVEC
 	if (msr & MSR_VEC) {
-		giveup_altivec(current);
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
 		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
 		vcpu->arch.vscr = t->vscr;
 	}
@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
 #endif

-	current->thread.regs->msr |= msr;
-
 	if (msr & MSR_FP) {
 		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
 			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 	}

+	current->thread.regs->msr |= msr;
 	vcpu->arch.guest_owned_ext |= msr;
 	kvmppc_recalc_shadow_msr(vcpu);

 	return RESUME_GUEST;
 }

+/*
+ * Kernel code using FP or VMX could have flushed guest state to
+ * the thread_struct; if so, get it back now.
+ */
+static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
+{
+	unsigned long lost_ext;
+
+	lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
+	if (!lost_ext)
+		return;
+
+	if (lost_ext & MSR_FP)
+		kvmppc_load_up_fpu();
+	if (lost_ext & MSR_VEC)
+		kvmppc_load_up_altivec();
+	current->thread.regs->msr |= lost_ext;
+}
+
 int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                       unsigned int exit_nr)
 {
@ -772,7 +792,7 @@ program_interrupt:
 	}
 	case BOOK3S_INTERRUPT_SYSCALL:
 		if (vcpu->arch.papr_enabled &&
-		    (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
+		    (kvmppc_get_last_sc(vcpu) == 0x44000022) &&
 		    !(vcpu->arch.shared->msr & MSR_PR)) {
 			/* SC 1 papr hypercalls */
 			ulong cmd = kvmppc_get_gpr(vcpu, 3);
@ -890,8 +910,9 @@ program_interrupt:
 			local_irq_enable();
 			r = s;
 		} else {
-			kvmppc_lazy_ee_enable();
+			kvmppc_fix_ee_before_entry();
 		}
+		kvmppc_handle_lost_ext(vcpu);
 	}

 	trace_kvm_book3s_reenter(r, vcpu);
@ -1162,7 +1183,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);

-	kvmppc_lazy_ee_enable();
+	kvmppc_fix_ee_before_entry();

 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);

--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@ -19,6 +19,7 @@
 #include <asm/hvcall.h>
 #include <asm/xics.h>
 #include <asm/debug.h>
+#include <asm/time.h>

 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@ -674,8 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}

-	kvm_guest_enter();
-
 #ifdef CONFIG_PPC_FPU
 	/* Save userspace FPU state in stack */
 	enable_kernel_fp();
@ -698,7 +696,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	kvmppc_load_guest_fp(vcpu);
 #endif

-	kvmppc_lazy_ee_enable();
+	kvmppc_fix_ee_before_entry();

 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);

@ -1168,7 +1166,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			local_irq_enable();
 			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
 		} else {
-			kvmppc_lazy_ee_enable();
+			kvmppc_fix_ee_before_entry();
 		}
 	}

--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			kvm_guest_exit();
 			continue;
 		}
-
-		trace_hardirqs_on();
 #endif

 		kvm_guest_enter();
@ -420,6 +418,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 	return kvmppc_core_create_memslot(slot, npages);
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_userspace_memory_region *mem,
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@ -274,6 +274,14 @@ struct kvm_arch{
 	int css_support;
 };

+#define KVM_HVA_ERR_BAD		(-1UL)
+#define KVM_HVA_ERR_RO_BAD	(-2UL)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+	return IS_ERR_VALUE(addr);
+}
+
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
 extern char sie_exit;
 #endif
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@ -12,8 +12,6 @@ typedef struct {
 	unsigned long asce_bits;
 	unsigned long asce_limit;
 	unsigned long vdso_base;
-	/* Cloned contexts will be created with extended page tables. */
-	unsigned int alloc_pgste:1;
 	/* The mmu context has extended page tables. */
 	unsigned int has_pgste:1;
 } mm_context_t;
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@ -21,24 +21,7 @@ static inline int init_new_context(struct task_struct *tsk,
 #ifdef CONFIG_64BIT
 	mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #endif
-	if (current->mm && current->mm->context.alloc_pgste) {
-		/*
-		 * alloc_pgste indicates, that any NEW context will be created
-		 * with extended page tables. The old context is unchanged. The
-		 * page table allocation and the page table operations will
-		 * look at has_pgste to distinguish normal and extended page
-		 * tables. The only way to create extended page tables is to
-		 * set alloc_pgste and then create a new context (e.g. dup_mm).
-		 * The page table allocation is called after init_new_context
-		 * and if has_pgste is set, it will create extended page
-		 * tables.
-		 */
-		mm->context.has_pgste = 1;
-		mm->context.alloc_pgste = 1;
-	} else {
-		mm->context.has_pgste = 0;
-		mm->context.alloc_pgste = 0;
-	}
+	mm->context.has_pgste = 0;
 	mm->context.asce_limit = STACK_TOP_MAX;
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
 	return 0;
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@ -1442,6 +1442,17 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */

+static inline void pmdp_flush_lazy(struct mm_struct *mm,
+				   unsigned long address, pmd_t *pmdp)
+{
+	int active = (mm == current->active_mm) ? 1 : 0;
+
+	if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
+		__pmd_idte(address, pmdp);
+	else
+		mm->context.flush_mm = 1;
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE

 #define __HAVE_ARCH_PGTABLE_DEPOSIT
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@ -43,6 +43,7 @@ extern void execve_tail(void);
 #ifndef CONFIG_64BIT

 #define TASK_SIZE		(1UL << 31)
+#define TASK_MAX_SIZE		(1UL << 31)
 #define TASK_UNMAPPED_BASE	(1UL << 30)

 #else /* CONFIG_64BIT */
@ -51,6 +52,7 @@ extern void execve_tail(void);
 #define TASK_UNMAPPED_BASE	(test_thread_flag(TIF_31BIT) ? \
 					(1UL << 30) : (1UL << 41))
 #define TASK_SIZE		TASK_SIZE_OF(current)
+#define TASK_MAX_SIZE		(1UL << 53)

 #endif /* CONFIG_64BIT */

--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@ -119,12 +119,21 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
 	 * The layout is as follows:
 	 * - gpr 2 contains the subchannel id (passed as addr)
 	 * - gpr 3 contains the virtqueue index (passed as datamatch)
+	 * - gpr 4 contains the index on the bus (optionally)
 	 */
-	ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
-				vcpu->run->s.regs.gprs[2],
-				8, &vcpu->run->s.regs.gprs[3]);
+	ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
+				      vcpu->run->s.regs.gprs[2],
+				      8, &vcpu->run->s.regs.gprs[3],
+				      vcpu->run->s.regs.gprs[4]);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	/* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */
+
+	/*
+	 * Return cookie in gpr 2, but don't overwrite the register if the
+	 * diagnose will be handled by userspace.
+	 */
+	if (ret != -EOPNOTSUPP)
+		vcpu->run->s.regs.gprs[2] = ret;
+	/* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */
 	return ret < 0 ? ret : 0;
 }

--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@ -28,6 +28,7 @@
 #include <asm/pgtable.h>
 #include <asm/nmi.h>
 #include <asm/switch_to.h>
+#include <asm/facility.h>
 #include <asm/sclp.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
@ -84,9 +85,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };

-static unsigned long long *facilities;
+unsigned long *vfacilities;
 static struct gmap_notifier gmap_notifier;

+/* test availability of vfacility */
+static inline int test_vfacility(unsigned long nr)
+{
+	return __test_facility(nr, (void *) vfacilities);
+}
+
 /* Section: not file related */
 int kvm_arch_hardware_enable(void *garbage)
 {
@ -387,7 +394,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->ecb   = 6;
 	vcpu->arch.sie_block->ecb2  = 8;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
-	vcpu->arch.sie_block->fac   = (int) (long) facilities;
+	vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
 		     (unsigned long) vcpu);
@ -1063,6 +1070,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 	return 0;
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
@ -1129,20 +1140,20 @@ static int __init kvm_s390_init(void)
 	 * to hold the maximum amount of facilities. On the other hand, we
 	 * only set facilities that are known to work in KVM.
 	 */
-	facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
-	if (!facilities) {
+	vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
+	if (!vfacilities) {
 		kvm_exit();
 		return -ENOMEM;
 	}
-	memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
-	facilities[0] &= 0xff82fff3f47c0000ULL;
-	facilities[1] &= 0x001c000000000000ULL;
+	memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
+	vfacilities[0] &= 0xff82fff3f47c0000UL;
+	vfacilities[1] &= 0x001c000000000000UL;
 	return 0;
 }

 static void __exit kvm_s390_exit(void)
 {
-	free_page((unsigned long) facilities);
+	free_page((unsigned long) vfacilities);
 	kvm_exit();
 }

--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@ -24,6 +24,9 @@

 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);

+/* declare vfacilities extern */
+extern unsigned long *vfacilities;
+
 /* negativ values are error codes, positive values for internal conditions */
 #define SIE_INTERCEPT_RERUNVCPU		(1<<0)
 #define SIE_INTERCEPT_UCONTROL		(1<<1)
@ -112,6 +115,13 @@ static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
 	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }

+/* Set the condition code in the guest program status word */
+static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
+{
+	vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44);
+	vcpu->arch.sie_block->gpsw.mask |= cc << 44;
+}
+
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@ -164,8 +164,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
 	kfree(inti);
 no_interrupt:
 	/* Set condition code and we're done. */
-	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-	vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
+	kvm_s390_set_psw_cc(vcpu, cc);
 	return 0;
 }

@ -220,15 +219,13 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
 		 * Set condition code 3 to stop the guest from issueing channel
 		 * I/O instructions.
 		 */
-		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-		vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+		kvm_s390_set_psw_cc(vcpu, 3);
 		return 0;
 	}
 }

 static int handle_stfl(struct kvm_vcpu *vcpu)
 {
-	unsigned int facility_list;
 	int rc;

 	vcpu->stat.instruction_stfl++;
@ -236,15 +233,13 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);

-	/* only pass the facility bits, which we can handle */
-	facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
-
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
-			   &facility_list, sizeof(facility_list));
+			   vfacilities, 4);
 	if (rc)
 		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-	VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
-	trace_kvm_s390_handle_stfl(vcpu, facility_list);
+	VCPU_EVENT(vcpu, 5, "store facility list value %x",
+		   *(unsigned int *) vfacilities);
+	trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
 	return 0;
 }

@ -387,7 +382,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);

 	if (fc > 3) {
-		vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;	  /* cc 3 */
+		kvm_s390_set_psw_cc(vcpu, 3);
 		return 0;
 	}

@ -397,7 +392,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)

 	if (fc == 0) {
 		vcpu->run->s.regs.gprs[0] = 3 << 28;
-		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);  /* cc 0 */
+		kvm_s390_set_psw_cc(vcpu, 0);
 		return 0;
 	}

@ -431,12 +426,11 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 	}
 	trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
 	free_page(mem);
-	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+	kvm_s390_set_psw_cc(vcpu, 0);
 	vcpu->run->s.regs.gprs[0] = 0;
 	return 0;
 out_no_data:
-	/* condition code 3 */
-	vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+	kvm_s390_set_psw_cc(vcpu, 3);
 out_exception:
 	free_page(mem);
 	return rc;
@ -494,12 +488,12 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);

 	/* This basically extracts the mask half of the psw. */
-	vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
+	vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
 	vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
 	if (reg2) {
-		vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
+		vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
 		vcpu->run->s.regs.gprs[reg2] |=
-			vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
+			vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
 	}
 	return 0;
 }
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,

 	if ((from | to | len) & (PMD_SIZE - 1))
 		return -EINVAL;
-	if (len == 0 || from + len > PGDIR_SIZE ||
+	if (len == 0 || from + len > TASK_MAX_SIZE ||
 	    from + len < from || to + len < to)
 		return -EINVAL;

@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
 	spin_unlock(&gmap_notifier_lock);
 }

+static inline int page_table_with_pgste(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == 0;
+}
+
 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
 						    unsigned long vmaddr)
 {
@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
 	mp->vmaddr = vmaddr & PMD_MASK;
 	INIT_LIST_HEAD(&mp->mapper);
 	page->index = (unsigned long) mp;
-	atomic_set(&page->_mapcount, 3);
+	atomic_set(&page->_mapcount, 0);
 	table = (unsigned long *) page_to_phys(page);
 	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
 	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key);

 #else /* CONFIG_PGSTE */

+static inline int page_table_with_pgste(struct page *page)
+{
+	return 0;
+}
+
 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
 						    unsigned long vmaddr)
 {
@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	struct page *page;
 	unsigned int bit, mask;

-	if (mm_has_pgste(mm)) {
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (page_table_with_pgste(page)) {
 		gmap_disconnect_pgtable(mm, table);
 		return page_table_free_pgste(table);
 	}
 	/* Free 1K/2K page table fragment of a 4K page */
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
 	spin_lock_bh(&mm->context.list_lock);
 	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
 	unsigned int bit, mask;

 	mm = tlb->mm;
-	if (mm_has_pgste(mm)) {
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (page_table_with_pgste(page)) {
 		gmap_disconnect_pgtable(mm, table);
 		table = (unsigned long *) (__pa(table) | FRAG_MASK);
 		tlb_remove_table(tlb, table);
 		return;
 	}
 	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 	spin_lock_bh(&mm->context.list_lock);
 	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
 		list_del(&page->lru);
@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 }

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void thp_split_vma(struct vm_area_struct *vma)
+static inline void thp_split_vma(struct vm_area_struct *vma)
 {
 	unsigned long addr;
-	struct page *page;

-	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
-		page = follow_page(vma, addr, FOLL_SPLIT);
-	}
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
+		follow_page(vma, addr, FOLL_SPLIT);
 }

-void thp_split_mm(struct mm_struct *mm)
+static inline void thp_split_mm(struct mm_struct *mm)
 {
-	struct vm_area_struct *vma = mm->mmap;
+	struct vm_area_struct *vma;

-	while (vma != NULL) {
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
 		thp_split_vma(vma);
 		vma->vm_flags &= ~VM_HUGEPAGE;
 		vma->vm_flags |= VM_NOHUGEPAGE;
-		vma = vma->vm_next;
 	}
+	mm->def_flags |= VM_NOHUGEPAGE;
+}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

+static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
+				struct mm_struct *mm, pud_t *pud,
+				unsigned long addr, unsigned long end)
+{
+	unsigned long next, *table, *new;
+	struct page *page;
+	pmd_t *pmd;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+again:
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		table = (unsigned long *) pmd_deref(*pmd);
+		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+		if (page_table_with_pgste(page))
+			continue;
+		/* Allocate new page table with pgstes */
+		new = page_table_alloc_pgste(mm, addr);
+		if (!new) {
+			mm->context.has_pgste = 0;
+			continue;
+		}
+		spin_lock(&mm->page_table_lock);
+		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
+			/* Nuke pmd entry pointing to the "short" page table */
+			pmdp_flush_lazy(mm, addr, pmd);
+			pmd_clear(pmd);
+			/* Copy ptes from old table to new table */
+			memcpy(new, table, PAGE_SIZE/2);
+			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+			/* Establish new table */
+			pmd_populate(mm, pmd, (pte_t *) new);
+			/* Free old table with rcu, there might be a walker! */
+			page_table_free_rcu(tlb, table);
+			new = NULL;
+		}
+		spin_unlock(&mm->page_table_lock);
+		if (new) {
+			page_table_free_pgste(new);
+			goto again;
+		}
+	} while (pmd++, addr = next, addr != end);
+
+	return addr;
+}
+
+static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
+				   struct mm_struct *mm, pgd_t *pgd,
+				   unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	return addr;
+}
+
+static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
+			       unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pgd_t *pgd;
+
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
 /*
 * switch on pgstes for its userspace process (for kvm)
 */
 int s390_enable_sie(void)
 {
 	struct task_struct *tsk = current;
-	struct mm_struct *mm, *old_mm;
+	struct mm_struct *mm = tsk->mm;
+	struct mmu_gather tlb;

 	/* Do we have switched amode? If no, we cannot do sie */
 	if (s390_user_mode == HOME_SPACE_MODE)
@ -1069,57 +1163,16 @@ int s390_enable_sie(void)
 	if (mm_has_pgste(tsk->mm))
 		return 0;

-	/* lets check if we are allowed to replace the mm */
-	task_lock(tsk);
-	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-	    tsk->mm != tsk->active_mm) {
-		task_unlock(tsk);
-		return -EINVAL;
-	}
-	task_unlock(tsk);
-
-	/* we copy the mm and let dup_mm create the page tables with_pgstes */
-	tsk->mm->context.alloc_pgste = 1;
-	/* make sure that both mms have a correct rss state */
-	sync_mm_rss(tsk->mm);
-	mm = dup_mm(tsk);
-	tsk->mm->context.alloc_pgste = 0;
-	if (!mm)
-		return -ENOMEM;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	down_write(&mm->mmap_sem);
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
-	mm->def_flags |= VM_NOHUGEPAGE;
-#endif
-
-	/* Now lets check again if something happened */
-	task_lock(tsk);
-	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-	    tsk->mm != tsk->active_mm) {
-		mmput(mm);
-		task_unlock(tsk);
-		return -EINVAL;
-	}
-
-	/* ok, we are alone. No ptrace, no threads, etc. */
-	old_mm = tsk->mm;
-	tsk->mm = tsk->active_mm = mm;
-	preempt_disable();
-	update_mm(mm, tsk);
-	atomic_inc(&mm->context.attach_count);
-	atomic_dec(&old_mm->context.attach_count);
-	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-	preempt_enable();
-	task_unlock(tsk);
-	mmput(old_mm);
-	return 0;
+	/* Reallocate the page tables with pgstes */
+	mm->context.has_pgste = 1;
+	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
+	page_table_realloc(&tlb, mm, 0, TASK_SIZE);
+	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
+	up_write(&mm->mmap_sem);
+	return mm->context.has_pgste ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@ -286,6 +286,7 @@ struct kvm_mmu {
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
+	u64 bad_mt_xwr;

 	/*
 	 * Bitmap: bit set = last pte in walk
@ -323,6 +324,7 @@ struct kvm_pmu {
 	u64 global_ovf_ctrl;
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
+	u64 reserved_bits;
 	u8 version;
 	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
 	 * instruction.
 	 */
 	bool write_fault_to_shadow_pgtable;
+
+	/* set at EPT violation at this point */
+	unsigned long exit_qualification;
+
+	/* pv related host specific info */
+	struct {
+		bool pv_unhalted;
+	} pv;
 };

 struct kvm_lpage_info {
@ -802,8 +812,8 @@ extern u32  kvm_min_guest_tsc_khz;
 extern u32  kvm_max_guest_tsc_khz;

 enum emulation_result {
-	EMULATE_DONE,       /* no further processing */
-	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+	EMULATE_DONE,         /* no further processing */
+	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
 	EMULATE_FAIL,         /* can't emulate this instruction */
 };

--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,

 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));

 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24

 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				    (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
@ -106,12 +107,13 @@
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
+	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
 	{ EXIT_REASON_WBINVD,                "WBINVD" }, \
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
-	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
+	{ EXIT_REASON_INVPCID,               "INVPCID" }

 #endif /* _UAPIVMX_H */
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }

-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-	if (!pvclock_vdso_info) {
-		BUG();
-		return NULL;
-	}
-
-	return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-			        void *v)
-{
-	struct task_migration_notifier *mn = v;
-	struct pvclock_vsyscall_time_info *pvti;
-
-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-	/* this is NULL when pvclock vsyscall is not initialized */
-	if (unlikely(pvti == NULL))
-		return NOTIFY_DONE;
-
-	pvti->migrate_count++;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-	.notifier_call = pvclock_task_migrate,
-};
-
 /*
 * Initialize the generic pvclock vsyscall state.  This will allocate
 * a/some page(s) for the per-vcpu pvclock information, set up a
@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,

 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);

-	pvclock_vdso_info = i;
-
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}

-
-	register_task_migration_notifier(&pvclock_migrate);
-
 	return 0;
 }
 #endif
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
 			     (1 << KVM_FEATURE_PV_EOI) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+			     (1 << KVM_FEATURE_PV_UNHALT);

 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 	*((u32 *) (apic->regs + reg_off)) = val;
 }

-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
-	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
-	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
 static inline int apic_test_vector(int vec, void *bitmap)
 {
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);

-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
-	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+	apic_set_vector(vec, apic->regs + APIC_IRR);
 }

 static inline int apic_search_irr(struct kvm_lapic *apic)
@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (unlikely(!apic_enabled(apic)))
 			break;

+		result = 1;
+
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);

-		if (kvm_x86_ops->deliver_posted_interrupt) {
-			result = 1;
+		if (kvm_x86_ops->deliver_posted_interrupt)
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
-		} else {
-			result = !apic_test_and_set_irr(vector, apic);
-
-			if (!result) {
-				if (trig_mode)
-					apic_debug("level trig mode repeatedly "
-						"for vector %d", vector);
-				goto out;
-			}
+		else {
+			apic_set_irr(vector, apic);

 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
-out:
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-				trig_mode, vector, !result);
+					  trig_mode, vector, false);
 		break;

 	case APIC_DM_REMRD:
-		apic_debug("Ignoring delivery mode 3\n");
+		result = 1;
+		vcpu->arch.pv.pv_unhalted = 1;
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;

 	case APIC_DM_SMI:
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 					    * PT32_LEVEL_BITS))) - 1))

-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
-			| PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+			| shadow_x_mask | shadow_nx_mask)

 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
 	return pte & PT_PAGE_SIZE_MASK;
 }

-static int is_dirty_gpte(unsigned long pte)
-{
-	return pte & PT_DIRTY_MASK;
-}
-
 static int is_rmap_spte(u64 pte)
 {
 	return is_shadow_present_pte(pte);
@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	return __shadow_walk_next(iterator, *iterator->sptep);
 }

-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
 {
 	u64 spte;

+	BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+			VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
 	spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+	       shadow_user_mask | shadow_x_mask;
+
+	if (accessed)
+		spte |= shadow_accessed_mask;

 	mmu_spte_set(sptep, spte);
 }
@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }

-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }

-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *sp, u64 *spte,
-				  u64 gpte)
-{
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
-
-	if (!is_present_gpte(gpte))
-		goto no_present;
-
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
-
-	return false;
-
-no_present:
-	drop_spte(vcpu->kvm, spte);
-	return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);

-			link_shadow_page(iterator.sptep, sp);
+			link_shadow_page(iterator.sptep, sp, true);
 		}
 	}
 	return emulate;
@ -2808,7 +2781,7 @@ exit:
 	return ret;
 }

-static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+static bool page_fault_can_be_fast(u32 error_code)
 {
 	/*
 	 * Do not fix the mmio spte with invalid generation number which
@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	bool ret = false;
 	u64 spte = 0ull;

-	if (!page_fault_can_be_fast(vcpu, error_code))
+	if (!page_fault_can_be_fast(error_code))
 		return false;

 	walk_shadow_page_lockless_begin(vcpu);
@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);

 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 				  u32 access, struct x86_exception *exception)
@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 	++vcpu->stat.tlb_flush;
 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);

 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }

-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
-	unsigned mask;
-
-	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
-	mask = (unsigned)~ACC_WRITE_MASK;
-	/* Allow write access to dirty gptes */
-	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
-	*access &= mask;
-}
-
 static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 			   unsigned access, int *nr_present)
 {
@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 	return false;
 }

-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-	unsigned access;
-
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	access &= ~(gpte >> PT64_NX_SHIFT);
-
-	return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
 {
 	unsigned index;
@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
 	return mmu->last_pte_bitmap & (1 << index);
 }

+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;

+	context->bad_mt_xwr = 0;
+
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 	switch (context->root_level) {
@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	}
 }

-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+		struct kvm_mmu *context, bool execonly)
+{
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	int pte;
+
+	context->rsvd_bits_mask[0][3] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+	context->rsvd_bits_mask[0][2] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+	context->rsvd_bits_mask[0][1] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+	context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+	/* large page */
+	context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+	context->rsvd_bits_mask[1][2] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+	context->rsvd_bits_mask[1][1] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+	context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+	for (pte = 0; pte < 64; pte++) {
+		int rwx_bits = pte & 7;
+		int mt = pte >> 3;
+		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+				rwx_bits == 0x2 || rwx_bits == 0x6 ||
+				(rwx_bits == 0x4 && !execonly))
+			context->bad_mt_xwr |= (1ull << pte);
+	}
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+		struct kvm_mmu *mmu, bool ept)
 {
 	unsigned bit, byte, pfec;
 	u8 map;
@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
 			w = bit & ACC_WRITE_MASK;
 			u = bit & ACC_USER_MASK;

-			/* Not really needed: !nx will cause pte.nx to fault */
-			x |= !mmu->nx;
-			/* Allow supervisor writes if !cr0.wp */
-			w |= !is_write_protection(vcpu) && !uf;
-			/* Disallow supervisor fetches of user code if cr4.smep */
-			x &= !(smep && u && !uf);
+			if (!ept) {
+				/* Not really needed: !nx will cause pte.nx to fault */
+				x |= !mmu->nx;
+				/* Allow supervisor writes if !cr0.wp */
+				w |= !is_write_protection(vcpu) && !uf;
+				/* Disallow supervisor fetches of user code if cr4.smep */
+				x &= !(smep && u && !uf);
+			} else
+				/* Not really needed: no U/S accesses on ept  */
+				u = 1;

 			fault = (ff && !x) || (uf && !u) || (wf && !w);
 			map |= fault << bit;
@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->root_level = level;

 	reset_rsvds_bits_mask(vcpu, context);
-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);

 	ASSERT(is_pae(vcpu));
@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->root_level = PT32_ROOT_LEVEL;

 	reset_rsvds_bits_mask(vcpu, context);
-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);

 	context->new_cr3 = paging_new_cr3;
@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = paging32_gva_to_gpa;
 	}

-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);

 	return 0;
@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+		bool execonly)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+	context->nx = true;
+	context->new_cr3 = paging_new_cr3;
+	context->page_fault = ept_page_fault;
+	context->gva_to_gpa = ept_gva_to_gpa;
+	context->sync_page = ept_sync_page;
+	context->invlpg = ept_invlpg;
+	context->update_pte = ept_update_pte;
+	context->free = paging_free;
+	context->root_level = context->shadow_root_level;
+	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
+
+	update_permission_bitmask(vcpu, context, true);
+	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}

-	update_permission_bitmask(vcpu, g_context);
+	update_permission_bitmask(vcpu, g_context, false);
 	update_last_pte_bitmap(vcpu, g_context);

 	return 0;
@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
 		return true;
 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
 		return true;
-	old ^= PT64_NX_MASK;
-	new ^= PT64_NX_MASK;
+	old ^= shadow_nx_mask;
+	new ^= shadow_nx_mask;
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }

@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	switch (er) {
 	case EMULATE_DONE:
 		return 1;
-	case EMULATE_DO_MMIO:
+	case EMULATE_USER_EXIT:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
 	case EMULATE_FAIL:
@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
 	/*
 	 * The very rare case: if the generation-number is round,
 	 * zap all shadow pages.
-	 *
-	 * The max value is MMIO_MAX_GEN - 1 since it is not called
-	 * when mark memslot invalid.
 	 */
-	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+	if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
 		printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@ -71,6 +71,8 @@ enum {

 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+		bool execonly);

 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@ -23,6 +23,13 @@
 * so the code in this file is compiled twice, once per pte size.
 */

+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@ -32,6 +39,10 @@
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
 	#define CMPXCHG cmpxchg
@ -49,7 +60,26 @@
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) ept_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK 0
+	#define PT_GUEST_DIRTY_MASK 0
+	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }

+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+	unsigned mask;
+
+	/* dirty bit is not supported, so no need to track it */
+	if (!PT_GUEST_DIRTY_MASK)
+		return;
+
+	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+	mask = (unsigned)~ACC_WRITE_MASK;
+	/* Allow write access to dirty gptes */
+	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+		PT_WRITABLE_MASK;
+	*access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+		((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+	return is_present_gpte(pte);
+#else
+	return pte & 7;
+#endif
+}
+
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			       pt_element_t __user *ptep_user, unsigned index,
 			       pt_element_t orig_pte, pt_element_t new_pte)
@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }

+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!FNAME(is_present_gpte)(gpte))
+		goto no_present;
+
+	/* if accessed bit is not supported prefetch non accessed gpte */
+	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned access;
+#if PTTYPE == PTTYPE_EPT
+	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		ACC_USER_MASK;
+#else
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+	return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 					     struct kvm_mmu *mmu,
 					     struct guest_walker *walker,
@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	gfn_t table_gfn;
 	int ret;

+	/* dirty/accessed bits are not supported, so no need to update them */
+	if (!PT_GUEST_DIRTY_MASK)
+		return 0;
+
 	for (level = walker->max_level; level >= walker->level; --level) {
 		pte = orig_pte = walker->ptes[level - 1];
 		table_gfn = walker->table_gfn[level - 1];
 		ptep_user = walker->ptep_user[level - 1];
 		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
-		if (!(pte & PT_ACCESSED_MASK)) {
+		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
-			pte |= PT_ACCESSED_MASK;
+			pte |= PT_GUEST_ACCESSED_MASK;
 		}
-		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+		if (level == walker->level && write_fault &&
+				!(pte & PT_GUEST_DIRTY_MASK)) {
 			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-			pte |= PT_DIRTY_MASK;
+			pte |= PT_GUEST_DIRTY_MASK;
 		}
 		if (pte == orig_pte)
 			continue;
@ -170,7 +275,7 @@ retry_walk:
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte))
+		if (!FNAME(is_present_gpte)(pte))
 			goto error;
 		--walker->level;
 	}
@ -179,7 +284,7 @@ retry_walk:
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

-	accessed_dirty = PT_ACCESSED_MASK;
+	accessed_dirty = PT_GUEST_ACCESSED_MASK;
 	pt_access = pte_access = ACC_ALL;
 	++walker->level;

@ -215,17 +320,17 @@ retry_walk:

 		trace_kvm_mmu_paging_element(pte, walker->level);

-		if (unlikely(!is_present_gpte(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;

-		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
-					      walker->level))) {
+		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+					             walker->level))) {
 			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
 			goto error;
 		}

 		accessed_dirty &= pte;
-		pte_access = pt_access & gpte_access(vcpu, pte);
+		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

 		walker->ptes[walker->level - 1] = pte;
 	} while (!is_last_gpte(mmu, walker->level, pte));
@ -248,13 +353,15 @@ retry_walk:
 	walker->gfn = real_gpa >> PAGE_SHIFT;

 	if (!write_fault)
-		protect_clean_gpte(&pte_access, pte);
+		FNAME(protect_clean_gpte)(&pte_access, pte);
 	else
 		/*
-		 * On a write fault, fold the dirty bit into accessed_dirty by
-		 * shifting it one place right.
+		 * On a write fault, fold the dirty bit into accessed_dirty.
+		 * For modes without A/D bits support accessed_dirty will be
+		 * always clear.
 		 */
-		accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+		accessed_dirty &= pte >>
+			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

 	if (unlikely(!accessed_dirty)) {
 		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@ -279,6 +386,25 @@ error:
 	walker->fault.vector = PF_VECTOR;
 	walker->fault.error_code_valid = true;
 	walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+	/*
+	 * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+	 * misconfiguration requires to be injected. The detection is
+	 * done by is_rsvd_bits_set() above.
+	 *
+	 * We set up the value of exit_qualification to inject:
+	 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+	 * [5:3] - Calculated by the page walk of the guest EPT page tables
+	 * [7:8] - Derived from [7:8] of real exit_qualification
+	 *
+	 * The other bits are set to 0.
+	 */
+	if (!(errcode & PFERR_RSVD_MASK)) {
+		vcpu->arch.exit_qualification &= 0x187;
+		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+	}
+#endif
 	walker->fault.address = addr;
 	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif

 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	gfn_t gfn;
 	pfn_t pfn;

-	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 		return false;

 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

 	gfn = gpte_to_gfn(gpte);
-	pte_access = sp->role.access & gpte_access(vcpu, gpte);
-	protect_clean_gpte(&pte_access, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	FNAME(protect_clean_gpte)(&pte_access, gpte);
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 	if (is_error_pfn(pfn))
@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			goto out_gpte_changed;

 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}

 	for (;
@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,

 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}

 	clear_sp_write_flooding_count(it.sptep);
@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }

+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,

 	return gpa;
 }
+#endif

 /*
 * Using the cached information from sp->gfns is safe because:
@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;

-		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}

 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
-		pte_access &= gpte_access(vcpu, gpte);
-		protect_clean_gpte(&pte_access, gpte);
+		pte_access &= FNAME(gpte_access)(vcpu, gpte);
+		FNAME(protect_clean_gpte)(&pte_access, gpte);

 		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
 		      &nr_present))
@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef gpte_to_gfn
 #undef gpte_to_gfn_lvl
 #undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)

 static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		unsigned config, bool exclude_user, bool exclude_kernel,
-		bool intr)
+		bool intr, bool in_tx, bool in_tx_cp)
 {
 	struct perf_event *event;
 	struct perf_event_attr attr = {
@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.exclude_kernel = exclude_kernel,
 		.config = config,
 	};
+	if (in_tx)
+		attr.config |= HSW_IN_TX;
+	if (in_tx_cp)
+		attr.config |= HSW_IN_TX_CHECKPOINTED;

 	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)

 	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 				ARCH_PERFMON_EVENTSEL_INV |
-				ARCH_PERFMON_EVENTSEL_CMASK))) {
+				ARCH_PERFMON_EVENTSEL_CMASK |
+				HSW_IN_TX |
+				HSW_IN_TX_CHECKPOINTED))) {
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
 				unit_mask);
 		if (config != PERF_COUNT_HW_MAX)
@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	reprogram_counter(pmc, type, config,
 			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
 			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-			eventsel & ARCH_PERFMON_EVENTSEL_INT);
+			eventsel & ARCH_PERFMON_EVENTSEL_INT,
+			(eventsel & HSW_IN_TX),
+			(eventsel & HSW_IN_TX_CHECKPOINTED));
 }

 static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
 			arch_events[fixed_pmc_events[idx]].event_type,
 			!(en & 0x2), /* exclude user */
 			!(en & 0x1), /* exclude kernel */
-			pmi);
+			pmi, false, false);
 }

 static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
 			if (data == pmc->eventsel)
 				return 0;
-			if (!(data & 0xffffffff00200000ull)) {
+			if (!(data & pmu->reserved_bits)) {
 				reprogram_gp_counter(pmc, data);
 				return 0;
 			}
@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 	pmu->counter_bitmask[KVM_PMC_GP] = 0;
 	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
 	pmu->version = 0;
+	pmu->reserved_bits = 0xffffffff00200000ull;

 	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
 	if (!entry)
@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
 		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
 	pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+	if (entry &&
+	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
 }

 void kvm_pmu_init(struct kvm_vcpu *vcpu)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@ -373,6 +373,7 @@ struct nested_vmx {
 	 * we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
+	u64 msr_ia32_feature_control;
 };

 #define POSTED_INTR_ON  0
@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }

+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
 		(vmcs12->secondary_vm_exec_control & bit);
 }

-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-	struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }

+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
 	/*
@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
 	 * 17 must be 1.
 	 */
+	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
 	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+	nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
-	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
-	nested_vmx_exit_ctls_high = 0;
+		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+				      VM_EXIT_LOAD_IA32_EFER);

 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	/* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
 	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_entry_ctls_high &=
-		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-	nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+#ifdef CONFIG_X86_64
+		VM_ENTRY_IA32E_MODE |
+#endif
+		VM_ENTRY_LOAD_IA32_PAT;
+	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+				       VM_ENTRY_LOAD_IA32_EFER);

 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_WBINVD_EXITING;

+	if (enable_ept) {
+		/* nested EPT: emulate EPT also to L1 */
+		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+		nested_vmx_ept_caps &= vmx_capability.ept;
+		/*
+		 * Since invept is completely emulated we support both global
+		 * and context invalidation independent of what host cpu
+		 * supports
+		 */
+		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+			VMX_EPT_EXTENT_CONTEXT_BIT;
+	} else
+		nested_vmx_ept_caps = 0;
+
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
 	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)

 	switch (msr_index) {
 	case MSR_IA32_FEATURE_CONTROL:
-		*pdata = 0;
-		break;
+		if (nested_vmx_allowed(vcpu)) {
+			*pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+			break;
+		}
+		return 0;
 	case MSR_IA32_VMX_BASIC:
 		/*
 		 * This MSR reports some information about VMX support. We
@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 					nested_vmx_secondary_ctls_high);
 		break;
 	case MSR_IA32_VMX_EPT_VPID_CAP:
-		/* Currently, no nested ept or nested vpid */
-		*pdata = 0;
+		/* Currently, no nested vpid support */
+		*pdata = nested_vmx_ept_caps;
 		break;
 	default:
 		return 0;
@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 1;
 }

-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
+	bool host_initialized = msr_info->host_initiated;
+
 	if (!nested_vmx_allowed(vcpu))
 		return 0;

-	if (msr_index == MSR_IA32_FEATURE_CONTROL)
-		/* TODO: the right thing. */
+	if (msr_index == MSR_IA32_FEATURE_CONTROL) {
+		if (!host_initialized &&
+				to_vmx(vcpu)->nested.msr_ia32_feature_control
+				& FEATURE_CONTROL_LOCKED)
+			return 0;
+		to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
 		return 1;
+	}
+
 	/*
 	 * No need to treat VMX capability MSRs specially: If we don't handle
 	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_set_vmx_msr(vcpu, msr_index, data))
+		if (vmx_set_vmx_msr(vcpu, msr_info))
 			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)

 	/* It is a write fault? */
 	error_code = exit_qualification & (1U << 1);
+	/* It is a fetch fault? */
+	error_code |= (exit_qualification & (1U << 2)) << 2;
 	/* ept page table is present? */
 	error_code |= (exit_qualification >> 3) & 0x1;

+	vcpu->arch.exit_qualification = exit_qualification;
+
 	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }

@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)

 		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);

-		if (err == EMULATE_DO_MMIO) {
+		if (err == EMULATE_USER_EXIT) {
+			++vcpu->stat.mmio_exits;
 			ret = 0;
 			goto out;
 		}
@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 		free_loaded_vmcs(&vmx->vmcs01);
 }

+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_CF);
+}
+
 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-				 u32 vm_instruction_error);
+					u32 vm_instruction_error)
+{
+	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+		/*
+		 * failValid writes the error number to the current VMCS, which
+		 * can't be done there isn't a current VMCS.
+		 */
+		nested_vmx_failInvalid(vcpu);
+		return;
+	}
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_ZF);
+	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+	/*
+	 * We don't need to force a shadow sync because
+	 * VM_INSTRUCTION_ERROR is not shadowed
+	 */
+}

 /*
 * Emulate the VMXON instruction.
@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs *shadow_vmcs;
+	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

 	/* The Intel VMX Instruction Reference lists a bunch of bits that
 	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
+
+	if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+			!= VMXON_NEEDED_FEATURES) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
 	if (enable_shadow_vmcs) {
 		shadow_vmcs = alloc_vmcs();
 		if (!shadow_vmcs)
@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	vmx->nested.vmxon = true;

 	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
 	return 1;
 }

@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 		return 1;
 	free_nested(to_vmx(vcpu));
 	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
 	return 1;
 }

@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return 0;
 }

-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
-	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
-	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
-			    X86_EFLAGS_SF | X86_EFLAGS_OF))
-			| X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-					u32 vm_instruction_error)
-{
-	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-		/*
-		 * failValid writes the error number to the current VMCS, which
-		 * can't be done there isn't a current VMCS.
-		 */
-		nested_vmx_failInvalid(vcpu);
-		return;
-	}
-	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-			    X86_EFLAGS_SF | X86_EFLAGS_OF))
-			| X86_EFLAGS_ZF);
-	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
-	/*
-	 * We don't need to force a shadow sync because
-	 * VM_INSTRUCTION_ERROR is not shadowed
-	 */
-}
-
 /* Emulate the VMCLEAR instruction */
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 	unsigned long field;
 	u64 field_value;
 	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
-	unsigned long *fields = (unsigned long *)shadow_read_write_fields;
-	int num_fields = max_shadow_read_write_fields;
+	const unsigned long *fields = shadow_read_write_fields;
+	const int num_fields = max_shadow_read_write_fields;

 	vmcs_load(shadow_vmcs);

@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)

 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-	unsigned long *fields[] = {
-		(unsigned long *)shadow_read_write_fields,
-		(unsigned long *)shadow_read_only_fields
+	const unsigned long *fields[] = {
+		shadow_read_write_fields,
+		shadow_read_only_fields
 	};
-	int num_lists =  ARRAY_SIZE(fields);
-	int max_fields[] = {
+	const int max_fields[] = {
 		max_shadow_read_write_fields,
 		max_shadow_read_only_fields
 	};
@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)

 	vmcs_load(shadow_vmcs);

-	for (q = 0; q < num_lists; q++) {
+	for (q = 0; q < ARRAY_SIZE(fields); q++) {
 		for (i = 0; i < max_fields[q]; i++) {
 			field = fields[q][i];
 			vmcs12_read_any(&vmx->vcpu, field, &field_value);
@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }

+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info, types;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+	u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+	if (!(types & (1UL << type))) {
+		nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	switch (type) {
+	case VMX_EPT_EXTENT_CONTEXT:
+		if ((operand.eptp & eptp_mask) !=
+				(nested_ept_get_cr3(vcpu) & eptp_mask))
+			break;
+	case VMX_EPT_EXTENT_GLOBAL:
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		nested_vmx_succeed(vcpu);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_INVEPT]                  = handle_invept,
 };

 static const int kvm_vmx_max_exit_handlers =
@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+	case EXIT_REASON_INVEPT:
 		/*
 		 * VMX instructions trap unconditionally. This allows L1 to
 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has2(vmcs12,
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 	case EXIT_REASON_EPT_VIOLATION:
+		/*
+		 * L0 always deals with the EPT violation. If nested EPT is
+		 * used, and the nested mmu code discovers that the address is
+		 * missing in the guest EPT table (EPT12), the EPT violation
+		 * will be injected with nested_ept_inject_page_fault()
+		 */
+		return 0;
 	case EXIT_REASON_EPT_MISCONFIG:
+		/*
+		 * L2 never uses directly L1's EPT, but rather L0's own EPT
+		 * table (shadow on EPT) or a merged EPT table that L0 built
+		 * (EPT on EPT). So any problems with the structure of the
+		 * table is L0's fault.
+		 */
 		return 0;
 	case EXIT_REASON_PREEMPTION_TIMER:
 		return vmcs12->pin_based_vm_exec_control &
@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)

 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-	                                get_vmcs12(vcpu), vcpu)))) {
+					get_vmcs12(vcpu))))) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 		entry->ecx |= bit(X86_FEATURE_VMX);
 }

+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+		struct x86_exception *fault)
+{
+	struct vmcs12 *vmcs12;
+	nested_vmx_vmexit(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+
+	if (fault->error_code & PFERR_RSVD_MASK)
+		vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+	else
+		vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+	vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+	vmcs12->guest_physical_address = fault->address;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+	/* return the page table to be shadowed - in our case, EPT12 */
+	return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+			nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 /*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->guest_interruptibility_info);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 	kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 		vmcs12->guest_pending_dbg_exceptions);
 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

-	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-	vmcs_write32(VM_EXIT_CONTROLS,
-		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+	 * bits are further modified by vmx_set_efer() below.
+	 */
+	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+	 * emulated by vmx_set_efer(), below.
+	 */
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));

-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
-	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vcpu->arch.pat = vmcs12->guest_ia32_pat;
+	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);


@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmx_flush_tlb(vcpu);
 	}

+	if (nested_cpu_has_ept(vmcs12)) {
+		kvm_mmu_unload(vcpu);
+		nested_ept_init_mmu_context(vcpu);
+	}
+
 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->guest_ia32_efer;
 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
 	kvm_mmu_reset_context(vcpu);

+	/*
+	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
+	 */
+	if (enable_ept) {
+		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+	}
+
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

+	/*
+	 * In some cases (usually, nested EPT), L2 is allowed to change its
+	 * own CR3 without exiting. If it has changed it, we must keep it.
+	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+	 *
+	 * Additionally, restore L2's PDPTR to vmcs12.
+	 */
+	if (enable_ept) {
+		vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+
 	vmcs12->vm_entry_controls =
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12)
 {
+	struct kvm_segment seg;
+
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
 	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	kvm_set_cr4(vcpu, vmcs12->host_cr4);

-	/* shadow page tables on either EPT or shadow page tables */
+	if (nested_cpu_has_ept(vmcs12))
+		nested_ept_uninit_mmu_context(vcpu);
+
 	kvm_set_cr3(vcpu, vmcs12->host_cr3);
 	kvm_mmu_reset_context(vcpu);

@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
-	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
-	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
-	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
-	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
-	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
-	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
-	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
-	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
-	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);

-	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+		vcpu->arch.pat = vmcs12->host_ia32_pat;
+	}
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
 			vmcs12->host_ia32_perf_global_ctrl);

+	/* Set L1 segment info according to Intel SDM
+	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
+	seg = (struct kvm_segment) {
+		.base = 0,
+		.limit = 0xFFFFFFFF,
+		.selector = vmcs12->host_cs_selector,
+		.type = 11,
+		.present = 1,
+		.s = 1,
+		.g = 1
+	};
+	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+		seg.l = 1;
+	else
+		seg.db = 1;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+	seg = (struct kvm_segment) {
+		.base = 0,
+		.limit = 0xFFFFFFFF,
+		.type = 3,
+		.present = 1,
+		.s = 1,
+		.db = 1,
+		.g = 1
+	};
+	seg.selector = vmcs12->host_ds_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+	seg.selector = vmcs12->host_es_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+	seg.selector = vmcs12->host_ss_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+	seg.selector = vmcs12->host_fs_selector;
+	seg.base = vmcs12->host_fs_base;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+	seg.selector = vmcs12->host_gs_selector;
+	seg.base = vmcs12->host_gs_base;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+	seg = (struct kvm_segment) {
+		.base = vmcs12->host_tr_base,
+		.limit = 0x67,
+		.selector = vmcs12->host_tr_selector,
+		.type = 11,
+		.present = 1
+	};
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
 	kvm_set_dr(vcpu, 7, 0x400);
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 */
 	}

-	/*
-	 * Does the new cr3 value map to physical memory? (Note, we
-	 * catch an invalid cr3 even in real-mode, because it would
-	 * cause trouble later on when we turn on paging anyway.)
-	 *
-	 * A real CPU would silently accept an invalid cr3 and would
-	 * attempt to use it - with largely undefined (and often hard
-	 * to debug) behavior on the guest side.
-	 */
-	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		return 1;
 	vcpu->arch.cr3 = cr3;
 	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 	vcpu->arch.mmu.new_cr3(vcpu);
@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+	MSR_IA32_FEATURE_CONTROL
 };

 static unsigned num_msrs_to_save;
@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 #endif
 }

+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, this_tsc_khz;
@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		delta = user_ns.clock - now_ns;
 		local_irq_enable();
 		kvm->arch.kvmclock_offset = delta;
+		kvm_gen_update_masterclock(kvm);
 		break;
 	}
 	case KVM_GET_CLOCK: {
@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);

+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+				unsigned long *db)
+{
+	u32 dr6 = 0;
+	int i;
+	u32 enable, rwlen;
+
+	enable = dr7;
+	rwlen = dr7 >> 16;
+	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+			dr6 |= (1 << i);
+	return dr6;
+}
+
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+	struct kvm_run *kvm_run = vcpu->run;
+
+	/*
+	 * Use the "raw" value to see if TF was passed to the processor.
+	 * Note that the new value of the flags has not been saved yet.
+	 *
+	 * This is correct even for TF set by the guest, because "the
+	 * processor will not generate this exception after the instruction
+	 * that sets the TF flag".
+	 */
+	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+	if (unlikely(rflags & X86_EFLAGS_TF)) {
+		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+			kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+			kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+			kvm_run->debug.arch.exception = DB_VECTOR;
+			kvm_run->exit_reason = KVM_EXIT_DEBUG;
+			*r = EMULATE_USER_EXIT;
+		} else {
+			vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+			/*
+			 * "Certain debug exceptions may clear bit 0-3.  The
+			 * remaining contents of the DR6 register are never
+			 * cleared by the processor".
+			 */
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= DR6_BS;
+			kvm_queue_exception(vcpu, DB_VECTOR);
+		}
+	}
+}
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+	struct kvm_run *kvm_run = vcpu->run;
+	unsigned long eip = vcpu->arch.emulate_ctxt.eip;
+	u32 dr6 = 0;
+
+	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+		dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+					   vcpu->arch.guest_debug_dr7,
+					   vcpu->arch.eff_db);
+
+		if (dr6 != 0) {
+			kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+			kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
+				get_segment_base(vcpu, VCPU_SREG_CS);
+
+			kvm_run->debug.arch.exception = DB_VECTOR;
+			kvm_run->exit_reason = KVM_EXIT_DEBUG;
+			*r = EMULATE_USER_EXIT;
+			return true;
+		}
+	}
+
+	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+		dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+					   vcpu->arch.dr7,
+					   vcpu->arch.db);
+
+		if (dr6 != 0) {
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= dr6;
+			kvm_queue_exception(vcpu, DB_VECTOR);
+			*r = EMULATE_DONE;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    unsigned long cr2,
 			    int emulation_type,
@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,

 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		init_emulate_ctxt(vcpu);
+
+		/*
+		 * We will reenter on the same instruction since
+		 * we do not set complete_userspace_io.  This does not
+		 * handle watchpoints yet, those would be handled in
+		 * the emulate_ops.
+		 */
+		if (kvm_vcpu_check_breakpoint(vcpu, &r))
+			return r;
+
 		ctxt->interruptibility = 0;
 		ctxt->have_exception = false;
 		ctxt->perm_ok = false;
@ -5031,17 +5146,18 @@ restart:
 		inject_emulated_exception(vcpu);
 		r = EMULATE_DONE;
 	} else if (vcpu->arch.pio.count) {
-		if (!vcpu->arch.pio.in)
+		if (!vcpu->arch.pio.in) {
+			/* FIXME: return into emulator if single-stepping.  */
 			vcpu->arch.pio.count = 0;
-		else {
+		} else {
 			writeback = false;
 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
 		}
-		r = EMULATE_DO_MMIO;
+		r = EMULATE_USER_EXIT;
 	} else if (vcpu->mmio_needed) {
 		if (!vcpu->mmio_is_write)
 			writeback = false;
-		r = EMULATE_DO_MMIO;
+		r = EMULATE_USER_EXIT;
 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
@ -5050,10 +5166,12 @@ restart:

 	if (writeback) {
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
-		kvm_set_rflags(vcpu, ctxt->eflags);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 		kvm_rip_write(vcpu, ctxt->eip);
+		if (r == EMULATE_DONE)
+			kvm_vcpu_check_singlestep(vcpu, &r);
+		kvm_set_rflags(vcpu, ctxt->eflags);
 	} else
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;

@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
 int kvm_arch_init(void *opaque)
 {
 	int r;
-	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+	struct kvm_x86_ops *ops = opaque;

 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	return 1;
 }

+/*
+ * kvm_pv_kick_cpu_op:  Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+	struct kvm_lapic_irq lapic_irq;
+
+	lapic_irq.shorthand = 0;
+	lapic_irq.dest_mode = 0;
+	lapic_irq.dest_id = apicid;
+
+	lapic_irq.delivery_mode = APIC_DM_REMRD;
+	kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
 		break;
+	case KVM_HC_KICK_CPU:
+		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+		ret = 0;
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }

-static void kvm_gen_update_masterclock(struct kvm *kvm)
-{
-#ifdef CONFIG_X86_64
-	int i;
-	struct kvm_vcpu *vcpu;
-	struct kvm_arch *ka = &kvm->arch;
-
-	spin_lock(&ka->pvclock_gtod_sync_lock);
-	kvm_make_mclock_inprogress_request(kvm);
-	/* no guest entries from this point */
-	pvclock_update_vm_gtod_copy(kvm);
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
-
-	/* guest entries allowed */
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
-
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
-#endif
-}
-
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
+					vcpu->arch.pv.pv_unhalted = false;
 					vcpu->arch.mp_state =
 						KVM_MP_STATE_RUNNABLE;
 				case KVM_MP_STATE_RUNNABLE:
@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)

 	if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
 		vcpu->mmio_needed = 0;
+
+		/* FIXME: return into emulator if single-stepping.  */
 		if (vcpu->mmio_is_write)
 			return 1;
 		vcpu->mmio_read_completed = 1;
@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
 	kvm_apic_accept_events(vcpu);
-	mp_state->mp_state = vcpu->arch.mp_state;
+	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+					vcpu->arch.pv.pv_unhalted)
+		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+	else
+		mp_state->mp_state = vcpu->arch.mp_state;
+
 	return 0;
 }

@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;

+	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@ -7019,6 +7144,15 @@ out_free:
 	return -ENOMEM;
 }

+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+	/*
+	 * memslots->generation has been incremented.
+	 * mmio generation may have reached its maximum value.
+	 */
+	kvm_mmu_invalidate_mmio_sptes(kvm);
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem,
@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 */
 	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-	/*
-	 * If memory slot is created, or moved, we need to clear all
-	 * mmio sptes.
-	 */
-	kvm_mmu_invalidate_mmio_sptes(kvm);
 }

 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
 		|| kvm_apic_has_events(vcpu)
+		|| vcpu->arch.pv.pv_unhalted
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;


 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)

 		pvti = get_pvti(cpu);

-		migrate_count = pvti->migrate_count;
-
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);

 		/*
@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+			  pvti->pvti.version != version));

 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@ -200,11 +200,9 @@ config DMA_SHARED_BUFFER
 	  APIs extension; the file's descriptor can then be passed on to other
 	  driver.

-config CMA
-	bool "Contiguous Memory Allocator"
-	depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK
-	select MIGRATION
-	select MEMORY_ISOLATION
+config DMA_CMA
+	bool "DMA Contiguous Memory Allocator"
+	depends on HAVE_DMA_CONTIGUOUS && CMA
 	help
 	  This enables the Contiguous Memory Allocator which allows drivers
 	  to allocate big physically-contiguous blocks of memory for use with
@ -213,17 +211,7 @@ config CMA
 	  For more information see <include/linux/dma-contiguous.h>.
 	  If unsure, say "n".

-if CMA
-
-config CMA_DEBUG
-	bool "CMA debug messages (DEVELOPMENT)"
-	depends on DEBUG_KERNEL
-	help
-	  Turns on debug messages in CMA.  This produces KERN_DEBUG
-	  messages for every CMA call as well as various messages while
-	  processing calls such as dma_alloc_from_contiguous().
-	  This option does not affect warning and error messages.
-
+if  DMA_CMA
 comment "Default contiguous memory area size:"

 config CMA_SIZE_MBYTES
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@ -6,7 +6,7 @@ obj-y			:= core.o bus.o dd.o syscore.o \
 			   attribute_container.o transport_class.o \
 			   topology.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
-obj-$(CONFIG_CMA) += dma-contiguous.o
+obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@ -26,7 +26,7 @@
 #include <linux/types.h>
 #include <linux/irqchip/arm-gic.h>

-#define VGIC_NR_IRQS		128
+#define VGIC_NR_IRQS		256
 #define VGIC_NR_SGIS		16
 #define VGIC_NR_PPIS		16
 #define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@ -57,7 +57,7 @@ struct cma;
 struct page;
 struct device;

-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA

 /*
 * There is always at least global CMA area and a few optional device
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@ -85,6 +85,12 @@ static inline bool is_noslot_pfn(pfn_t pfn)
 	return pfn == KVM_PFN_NOSLOT;
 }

+/*
+ * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
+ * provide own defines and kvm_is_error_hva
+ */
+#ifndef KVM_HVA_ERR_BAD
+
 #define KVM_HVA_ERR_BAD		(PAGE_OFFSET)
 #define KVM_HVA_ERR_RO_BAD	(PAGE_OFFSET + PAGE_SIZE)

@ -93,6 +99,8 @@ static inline bool kvm_is_error_hva(unsigned long addr)
 	return addr >= PAGE_OFFSET;
 }

+#endif
+
 #define KVM_ERR_PTR_BAD_PAGE	(ERR_PTR(-ENOENT))

 static inline bool is_error_page(struct page *page)
@ -160,8 +168,12 @@ enum kvm_bus {

 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@ -499,6 +511,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
+void kvm_arch_memslots_updated(struct kvm *kvm);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem,
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);

-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-	struct task_struct *task;
-	int from_cpu;
-	int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);

 extern void dump_cpu_task(int cpu);
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94

 #ifdef KVM_CAP_IRQ_ROUTING

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq->skip_clock_update = 1;
 }

-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);

 	if (task_cpu(p) != new_cpu) {
-		struct task_migration_notifier tmn;
-
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
-		tmn.task = p;
-		tmn.from_cpu = task_cpu(p);
-		tmn.to_cpu = new_cpu;
-
-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}

 	__set_task_cpu(p, new_cpu);
--- a/mm/Kconfig
+++ b/mm/Kconfig
@ -478,6 +478,30 @@ config FRONTSWAP

 	  If unsure, say Y to enable frontswap.

+config CMA
+	bool "Contiguous Memory Allocator"
+	depends on HAVE_MEMBLOCK
+	select MIGRATION
+	select MEMORY_ISOLATION
+	help
+	  This enables the Contiguous Memory Allocator which allows other
+	  subsystems to allocate big physically-contiguous blocks of memory.
+	  CMA reserves a region of memory and allows only movable pages to
+	  be allocated from it. This way, the kernel can use the memory for
+	  pagecache and when a subsystem requests for contiguous area, the
+	  allocated pages are migrated away to serve the contiguous request.
+
+	  If unsure, say "n".
+
+config CMA_DEBUG
+	bool "CMA debug messages (DEVELOPMENT)"
+	depends on DEBUG_KERNEL && CMA
+	help
+	  Turns on debug messages in CMA.  This produces KERN_DEBUG
+	  messages for every CMA call as well as various messages while
+	  processing calls such as dma_alloc_from_contiguous().
+	  This option does not affect warning and error messages.
+
 config ZBUD
 	tristate
 	default n
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@ -149,7 +149,7 @@ static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
 {
 	offset >>= 2;
 	BUG_ON(offset > (VGIC_NR_IRQS / 4));
-	if (offset < 4)
+	if (offset < 8)
 		return x->percpu[cpuid] + offset;
 	else
 		return x->shared + offset - 8;
@ -432,19 +432,13 @@ static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
 static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, c;
-	unsigned long *bmap;
+	int i;
 	u32 val = 0;

 	irq -= VGIC_NR_PRIVATE_IRQS;

-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-		for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-			if (test_bit(irq + i, bmap))
-				val |= 1 << (c + i * 8);
-	}
+	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
+		val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);

 	return val;
 }
@ -547,8 +541,12 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
 				struct kvm_exit_mmio *mmio, phys_addr_t offset)
 {
 	u32 val;
-	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				       vcpu->vcpu_id, offset >> 1);
+	u32 *reg;
+
+	offset >>= 1;
+	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				  vcpu->vcpu_id, offset);
+
 	if (offset & 2)
 		val = *reg >> 16;
 	else
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@ -102,28 +102,8 @@ static bool largepages_enabled = true;

 bool kvm_is_mmio_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
-		int reserved;
-		struct page *tail = pfn_to_page(pfn);
-		struct page *head = compound_trans_head(tail);
-		reserved = PageReserved(head);
-		if (head != tail) {
-			/*
-			 * "head" is not a dangling pointer
-			 * (compound_trans_head takes care of that)
-			 * but the hugepage may have been splitted
-			 * from under us (and we may not hold a
-			 * reference count on the head page so it can
-			 * be reused before we run PageReferenced), so
-			 * we've to check PageTail before returning
-			 * what we just read.
-			 */
-			smp_rmb();
-			if (PageTail(tail))
-				return reserved;
-		}
-		return PageReserved(tail);
-	}
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));

 	return true;
 }
@ -731,7 +711,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	update_memslots(slots, new, kvm->memslots->generation);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
-	return old_memslots; 
+
+	kvm_arch_memslots_updated(kvm);
+
+	return old_memslots;
 }

 /*
@ -1893,7 +1876,7 @@ static struct file_operations kvm_vcpu_fops = {
 */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }

 /*
@ -2302,7 +2285,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}

-	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
 		return ret;
@ -2586,7 +2569,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		return r;
 	}
 #endif
-	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
 	if (r < 0)
 		kvm_put_kvm(kvm);

@ -2812,11 +2795,9 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	kfree(bus);
 }

-static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
+                                 const struct kvm_io_range *r2)
 {
-	const struct kvm_io_range *r1 = p1;
-	const struct kvm_io_range *r2 = p2;
-
 	if (r1->addr < r2->addr)
 		return -1;
 	if (r1->addr + r1->len > r2->addr + r2->len)
@ -2824,6 +2805,11 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 	return 0;
 }

+static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+	return kvm_io_bus_cmp(p1, p2);
+}
+
 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
 			  gpa_t addr, int len)
 {
@ -2857,17 +2843,54 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,

 	off = range - bus->range;

-	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
 		off--;

 	return off;
 }

+static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+			      struct kvm_io_range *range, const void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+					range->len, val))
+			return idx;
+		idx++;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int idx;
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+	int r;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	r = __kvm_io_bus_write(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie)
+{
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;

@ -2877,14 +2900,35 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};

 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+					val))
+			return cookie;
+
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_write(bus, &range, val);
+}
+
+static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
+			     void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
 	if (idx < 0)
 		return -EOPNOTSUPP;

 	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
-			return 0;
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+				       range->len, val))
+			return idx;
 		idx++;
 	}

@ -2895,7 +2939,24 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
-	int idx;
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+	int r;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	r = __kvm_io_bus_read(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie)
+{
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;

@ -2905,18 +2966,19 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};

 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
-	if (idx < 0)
-		return -EOPNOTSUPP;

-	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
-			return 0;
-		idx++;
-	}
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
+				       val))
+			return cookie;

-	return -EOPNOTSUPP;
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_read(bus, &range, val);
 }

 /* Caller must hold slots_lock. */