Bugfixes: unwinding of KVM_CREATE_VM failure,
VT-d posted interrupts, DAX/ZONE_DEVICE, module unload/reload. -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.22 (GNU/Linux) iQEcBAABAgAGBQJdyrEsAAoJEL/70l94x66DIOkH/Asqrh4o4pwfRHWE+9rnM6PI j8oFi7Q4eRXJnP4zEMnMbb6xD/BfSH1tWEcPcYgIxD/t0DFx8F92/xsETAJ/Qc5n CWpmnhMkJqERlV+GSRuBqnheMo0CEH1Ab1QZKhh5U3//pK3OtGY9WyydJHWcquTh bGh2pnxwVZOtIIEmclUUfKjyR2Fu8hJLnQwzWgYZ27UK7J2pLmiiTX0vwQG359Iq sDn9ND33pCBW5e/D2mzccRjOJEvzwrumewM1sRDsoAYLJzUjg9+xD83vZDa1d7R6 gajCDFWVJbPoLvUY+DgsZBwMMlogElimJMT/Zft3ERbCsYJbFvcmwp4JzyxDxQ4= =J6KN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm fixes from Paolo Bonzini: "Fix unwinding of KVM_CREATE_VM failure, VT-d posted interrupts, DAX/ZONE_DEVICE, and module unload/reload" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved KVM: VMX: Introduce pi_is_pir_empty() helper KVM: VMX: Do not change PID.NDST when loading a blocked vCPU KVM: VMX: Consider PID.PIR to determine if vCPU has pending interrupts KVM: VMX: Fix comment to specify PID.ON instead of PIR.ON KVM: X86: Fix initialization of MSR lists KVM: fix placement of refcount initialization KVM: Fix NULL-ptr deref after kvm_create_vm fails
This commit is contained in:
commit
8c5bd25bf4
@ -3393,7 +3393,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
|
||||
* here.
|
||||
*/
|
||||
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
|
||||
level == PT_PAGE_TABLE_LEVEL &&
|
||||
!kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
|
||||
PageTransCompoundMap(pfn_to_page(pfn)) &&
|
||||
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
|
||||
unsigned long mask;
|
||||
@ -6009,9 +6009,9 @@ restart:
|
||||
* the guest, and the guest page table is using 4K page size
|
||||
* mapping if the indirect sp has level = 1.
|
||||
*/
|
||||
if (sp->role.direct &&
|
||||
!kvm_is_reserved_pfn(pfn) &&
|
||||
PageTransCompoundMap(pfn_to_page(pfn))) {
|
||||
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
|
||||
!kvm_is_zone_device_pfn(pfn) &&
|
||||
PageTransCompoundMap(pfn_to_page(pfn))) {
|
||||
pte_list_remove(rmap_head, sptep);
|
||||
|
||||
if (kvm_available_flush_tlb_with_range())
|
||||
|
@ -1268,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
|
||||
* PID.NDST: pi_post_block is the one expected to change PID.NDST and the
|
||||
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that
|
||||
* matches PID.NDST. Otherwise, a vcpu may not be able to be woken up
|
||||
* correctly.
|
||||
*/
|
||||
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
|
||||
pi_clear_sn(pi_desc);
|
||||
goto after_clear_sn;
|
||||
}
|
||||
|
||||
/* The full case. */
|
||||
do {
|
||||
old.control = new.control = pi_desc->control;
|
||||
@ -1283,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
} while (cmpxchg64(&pi_desc->control, old.control,
|
||||
new.control) != old.control);
|
||||
|
||||
after_clear_sn:
|
||||
|
||||
/*
|
||||
* Clear SN before reading the bitmap. The VT-d firmware
|
||||
* writes the bitmap and reads SN atomically (5.2.3 in the
|
||||
@ -1291,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
|
||||
if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
|
||||
if (!pi_is_pir_empty(pi_desc))
|
||||
pi_set_on(pi_desc);
|
||||
}
|
||||
|
||||
@ -6137,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
|
||||
if (pi_test_on(&vmx->pi_desc)) {
|
||||
pi_clear_on(&vmx->pi_desc);
|
||||
/*
|
||||
* IOMMU can write to PIR.ON, so the barrier matters even on UP.
|
||||
* IOMMU can write to PID.ON, so the barrier matters even on UP.
|
||||
* But on x86 this is just a compiler barrier anyway.
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
@ -6167,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
|
||||
|
||||
static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return pi_test_on(vcpu_to_pi_desc(vcpu));
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
|
||||
return pi_test_on(pi_desc) ||
|
||||
(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
|
||||
}
|
||||
|
||||
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
|
||||
|
@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
|
||||
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
|
||||
}
|
||||
|
||||
/*
 * Report whether the pi_desc->pir bitmap is empty: returns the result of
 * bitmap_empty() over the first NR_VECTORS bits of that bitmap.
 */
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
|
||||
{
|
||||
return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
|
||||
}
|
||||
|
||||
static inline void pi_set_sn(struct pi_desc *pi_desc)
|
||||
{
|
||||
set_bit(POSTED_INTR_SN,
|
||||
@ -373,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
|
||||
(unsigned long *)&pi_desc->control);
|
||||
}
|
||||
|
||||
/*
 * Clear the POSTED_INTR_SN bit in pi_desc->control via clear_bit().
 * The control field is accessed through an unsigned long pointer cast.
 */
static inline void pi_clear_sn(struct pi_desc *pi_desc)
|
||||
{
|
||||
clear_bit(POSTED_INTR_SN,
|
||||
(unsigned long *)&pi_desc->control);
|
||||
}
|
||||
|
||||
static inline int pi_test_on(struct pi_desc *pi_desc)
|
||||
{
|
||||
return test_bit(POSTED_INTR_ON,
|
||||
|
@ -1133,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
|
||||
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
|
||||
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
|
||||
*
|
||||
* This list is modified at module load time to reflect the
|
||||
* The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
|
||||
* extract the supported MSRs from the related const lists.
|
||||
* msrs_to_save is selected from the msrs_to_save_all to reflect the
|
||||
* capabilities of the host cpu. This capabilities test skips MSRs that are
|
||||
* kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
|
||||
* kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
|
||||
* may depend on host virtualization features rather than host cpu features.
|
||||
*/
|
||||
|
||||
static u32 msrs_to_save[] = {
|
||||
static const u32 msrs_to_save_all[] = {
|
||||
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
|
||||
MSR_STAR,
|
||||
#ifdef CONFIG_X86_64
|
||||
@ -1180,9 +1182,10 @@ static u32 msrs_to_save[] = {
|
||||
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
|
||||
};
|
||||
|
||||
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
|
||||
static unsigned num_msrs_to_save;
|
||||
|
||||
static u32 emulated_msrs[] = {
|
||||
static const u32 emulated_msrs_all[] = {
|
||||
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
|
||||
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
|
||||
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
|
||||
@ -1221,7 +1224,7 @@ static u32 emulated_msrs[] = {
|
||||
* by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
|
||||
* We always support the "true" VMX control MSRs, even if the host
|
||||
* processor does not, so I am putting these registers here rather
|
||||
* than in msrs_to_save.
|
||||
* than in msrs_to_save_all.
|
||||
*/
|
||||
MSR_IA32_VMX_BASIC,
|
||||
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
|
||||
@ -1240,13 +1243,14 @@ static u32 emulated_msrs[] = {
|
||||
MSR_KVM_POLL_CONTROL,
|
||||
};
|
||||
|
||||
static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
|
||||
static unsigned num_emulated_msrs;
|
||||
|
||||
/*
|
||||
* List of msr numbers which are used to expose MSR-based features that
|
||||
* can be used by a hypervisor to validate requested CPU features.
|
||||
*/
|
||||
static u32 msr_based_features[] = {
|
||||
static const u32 msr_based_features_all[] = {
|
||||
MSR_IA32_VMX_BASIC,
|
||||
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
|
||||
MSR_IA32_VMX_PINBASED_CTLS,
|
||||
@ -1271,6 +1275,7 @@ static u32 msr_based_features[] = {
|
||||
MSR_IA32_ARCH_CAPABILITIES,
|
||||
};
|
||||
|
||||
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
|
||||
static unsigned int num_msr_based_features;
|
||||
|
||||
static u64 kvm_get_arch_capabilities(void)
|
||||
@ -5118,22 +5123,22 @@ static void kvm_init_msr_list(void)
|
||||
{
|
||||
struct x86_pmu_capability x86_pmu;
|
||||
u32 dummy[2];
|
||||
unsigned i, j;
|
||||
unsigned i;
|
||||
|
||||
BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
|
||||
"Please update the fixed PMCs in msrs_to_save[]");
|
||||
"Please update the fixed PMCs in msrs_to_saved_all[]");
|
||||
|
||||
perf_get_x86_pmu_capability(&x86_pmu);
|
||||
|
||||
for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
|
||||
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
|
||||
for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
|
||||
if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Even MSRs that are valid in the host may not be exposed
|
||||
* to the guests in some cases.
|
||||
*/
|
||||
switch (msrs_to_save[i]) {
|
||||
switch (msrs_to_save_all[i]) {
|
||||
case MSR_IA32_BNDCFGS:
|
||||
if (!kvm_mpx_supported())
|
||||
continue;
|
||||
@ -5161,17 +5166,17 @@ static void kvm_init_msr_list(void)
|
||||
break;
|
||||
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
|
||||
if (!kvm_x86_ops->pt_supported() ||
|
||||
msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
|
||||
msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
|
||||
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
|
||||
continue;
|
||||
break;
|
||||
case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
|
||||
if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
|
||||
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
|
||||
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
|
||||
continue;
|
||||
break;
|
||||
case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
|
||||
if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
|
||||
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
|
||||
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
|
||||
continue;
|
||||
}
|
||||
@ -5179,34 +5184,25 @@ static void kvm_init_msr_list(void)
|
||||
break;
|
||||
}
|
||||
|
||||
if (j < i)
|
||||
msrs_to_save[j] = msrs_to_save[i];
|
||||
j++;
|
||||
msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
|
||||
}
|
||||
num_msrs_to_save = j;
|
||||
|
||||
for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
|
||||
if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
|
||||
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
|
||||
if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
|
||||
continue;
|
||||
|
||||
if (j < i)
|
||||
emulated_msrs[j] = emulated_msrs[i];
|
||||
j++;
|
||||
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
|
||||
}
|
||||
num_emulated_msrs = j;
|
||||
|
||||
for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
|
||||
for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
|
||||
struct kvm_msr_entry msr;
|
||||
|
||||
msr.index = msr_based_features[i];
|
||||
msr.index = msr_based_features_all[i];
|
||||
if (kvm_get_msr_feature(&msr))
|
||||
continue;
|
||||
|
||||
if (j < i)
|
||||
msr_based_features[j] = msr_based_features[i];
|
||||
j++;
|
||||
msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
|
||||
}
|
||||
num_msr_based_features = j;
|
||||
}
|
||||
|
||||
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
|
||||
|
@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
|
||||
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
|
||||
|
||||
bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
|
||||
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
|
||||
|
||||
struct kvm_irq_ack_notifier {
|
||||
struct hlist_node link;
|
||||
|
@ -150,10 +150,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Tell whether @pfn is backed by a ZONE_DEVICE page.
 *
 * Returns false when pfn_valid() rejects @pfn or when the page's
 * page_count() is zero (which also fires WARN_ON_ONCE); otherwise returns
 * whatever is_zone_device_page() reports for the page.
 */
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
|
||||
{
|
||||
/*
|
||||
* The metadata used by is_zone_device_page() to determine whether or
|
||||
* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
|
||||
* the device has been pinned, e.g. by get_user_pages(). WARN if the
|
||||
* page_count() is zero to help detect bad usage of this helper.
|
||||
*/
|
||||
if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
|
||||
return false;
|
||||
|
||||
return is_zone_device_page(pfn_to_page(pfn));
|
||||
}
|
||||
|
||||
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
|
||||
{
|
||||
/*
|
||||
* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
|
||||
* perspective they are "normal" pages, albeit with slightly different
|
||||
* usage rules.
|
||||
*/
|
||||
if (pfn_valid(pfn))
|
||||
return PageReserved(pfn_to_page(pfn));
|
||||
return PageReserved(pfn_to_page(pfn)) &&
|
||||
!kvm_is_zone_device_pfn(pfn);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -663,6 +683,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
||||
|
||||
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
|
||||
|
||||
if (init_srcu_struct(&kvm->srcu))
|
||||
goto out_err_no_srcu;
|
||||
if (init_srcu_struct(&kvm->irq_srcu))
|
||||
goto out_err_no_irq_srcu;
|
||||
|
||||
refcount_set(&kvm->users_count, 1);
|
||||
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
|
||||
struct kvm_memslots *slots = kvm_alloc_memslots();
|
||||
|
||||
@ -680,7 +706,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
||||
goto out_err_no_arch_destroy_vm;
|
||||
}
|
||||
|
||||
refcount_set(&kvm->users_count, 1);
|
||||
r = kvm_arch_init_vm(kvm, type);
|
||||
if (r)
|
||||
goto out_err_no_arch_destroy_vm;
|
||||
@ -693,11 +718,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
||||
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
|
||||
#endif
|
||||
|
||||
if (init_srcu_struct(&kvm->srcu))
|
||||
goto out_err_no_srcu;
|
||||
if (init_srcu_struct(&kvm->irq_srcu))
|
||||
goto out_err_no_irq_srcu;
|
||||
|
||||
r = kvm_init_mmu_notifier(kvm);
|
||||
if (r)
|
||||
goto out_err_no_mmu_notifier;
|
||||
@ -720,19 +740,19 @@ out_err:
|
||||
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
|
||||
#endif
|
||||
out_err_no_mmu_notifier:
|
||||
cleanup_srcu_struct(&kvm->irq_srcu);
|
||||
out_err_no_irq_srcu:
|
||||
cleanup_srcu_struct(&kvm->srcu);
|
||||
out_err_no_srcu:
|
||||
hardware_disable_all();
|
||||
out_err_no_disable:
|
||||
kvm_arch_destroy_vm(kvm);
|
||||
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
|
||||
out_err_no_arch_destroy_vm:
|
||||
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
|
||||
for (i = 0; i < KVM_NR_BUSES; i++)
|
||||
kfree(kvm_get_bus(kvm, i));
|
||||
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
|
||||
kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
|
||||
cleanup_srcu_struct(&kvm->irq_srcu);
|
||||
out_err_no_irq_srcu:
|
||||
cleanup_srcu_struct(&kvm->srcu);
|
||||
out_err_no_srcu:
|
||||
kvm_arch_free_vm(kvm);
|
||||
mmdrop(current->mm);
|
||||
return ERR_PTR(r);
|
||||
@ -1886,7 +1906,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
|
||||
|
||||
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
|
||||
{
|
||||
if (!kvm_is_reserved_pfn(pfn)) {
|
||||
if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
SetPageDirty(page);
|
||||
@ -1896,7 +1916,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
|
||||
|
||||
void kvm_set_pfn_accessed(kvm_pfn_t pfn)
|
||||
{
|
||||
if (!kvm_is_reserved_pfn(pfn))
|
||||
if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
|
||||
mark_page_accessed(pfn_to_page(pfn));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
|
||||
|
Loading…
Reference in New Issue
Block a user