* Hide KVM_CAP_IRQFD_RESAMPLE if XIVE is enabled
 
 s390:
 
 * Fix handling of external interrupts in protected guests
 
 x86:
 
 * Resample the pending state of IOAPIC interrupts when unmasking them
 
 * Fix usage of Hyper-V "enlightened TLB" on AMD
 
 * Small fixes to real mode exceptions
 
 * Suppress pending MMIO write exits if emulator detects exception
 
 Documentation:
 
 * Fix rST syntax
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmQsXMQUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroOc6Qf/YMDcYxpPuYFZ69t9k4s0ubeGjGpr
 Qdzo6ZgKbfs4+nu89u2F5zvAftRSrEVQRpfLwmTXauM8el3s+EAEPlK/MLH6sSSo
 LG3eWoNxs7MUQ6gBWeQypY6opvz3W9hf8KYd+GY6TV7jwV5/YgH6674Yyd7T/bCm
 0/tUiil5Q7Da0w4GUY5m9gEqlbA9yjDBFySqqTZemo18NzLaPNpesKnm1hvv0QJG
 JRvOMmHwqAZgS0G5cyGEPZKY6Ga0v9QsAXc8mFC/Jtn9GMD2f47PYDRfd/7NSiVz
 ulgsk8zZkZIZ6nMbQ1t10txmGIbFmJ4iPPaATrMII/xHq+idgAlHU7zY1A==
 =QvFj
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "PPC:
   - Hide KVM_CAP_IRQFD_RESAMPLE if XIVE is enabled

  s390:
   - Fix handling of external interrupts in protected guests

  x86:
   - Resample the pending state of IOAPIC interrupts when unmasking them

   - Fix usage of Hyper-V "enlightened TLB" on AMD

   - Small fixes to real mode exceptions

   - Suppress pending MMIO write exits if emulator detects exception

  Documentation:
   - Fix rST syntax"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  docs: kvm: x86: Fix broken field list
  KVM: PPC: Make KVM_CAP_IRQFD_RESAMPLE platform dependent
  KVM: s390: pv: fix external interruption loop not always detected
  KVM: nVMX: Do not report error code when synthesizing VM-Exit from Real Mode
  KVM: x86: Clear "has_error_code", not "error_code", for RM exception injection
  KVM: x86: Suppress pending MMIO write exits if emulator detects exception
  KVM: x86/ioapic: Resample the pending state of an IRQ when unmasking
  KVM: irqfd: Make resampler_list an RCU list
  KVM: SVM: Flush Hyper-V TLB when required
This commit is contained in:
Linus Torvalds 2023-04-04 11:29:37 -07:00
commit 76f598ba7d
15 changed files with 192 additions and 29 deletions

View File

@ -8296,11 +8296,11 @@ ENOSYS for the others.
8.35 KVM_CAP_PMU_CAPABILITY
---------------------------
:Capability KVM_CAP_PMU_CAPABILITY
:Capability: KVM_CAP_PMU_CAPABILITY
:Architectures: x86
:Type: vm
:Parameters: arg[0] is bitmask of PMU virtualization capabilities.
:Returns 0 on success, -EINVAL when arg[0] contains invalid bits
:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits
This capability alters PMU virtualization in KVM.

View File

@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_PTP_KVM:
case KVM_CAP_ARM_SYSTEM_SUSPEND:
case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:

View File

@ -576,6 +576,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
case KVM_CAP_IRQFD_RESAMPLE:
r = !xive_enabled();
break;
#endif
case KVM_CAP_PPC_ALLOC_HTAB:
r = hv_enabled;
break;

View File

@ -271,10 +271,18 @@ static int handle_prog(struct kvm_vcpu *vcpu)
* handle_external_interrupt - used for external interruption interceptions
* @vcpu: virtual cpu
*
* This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
* the new PSW does not have external interrupts disabled. In the first case,
* we've got to deliver the interrupt manually, and in the second case, we
* drop to userspace to handle the situation there.
* This interception occurs if:
* - the CPUSTAT_EXT_INT bit was already set when the external interrupt
* occurred. In this case, the interrupt needs to be injected manually to
* preserve interrupt priority.
* - the external new PSW has external interrupts enabled, which will cause an
* interruption loop. We drop to userspace in this case.
*
* The latter case can be detected by inspecting the external mask bit in the
* external new psw.
*
* Under PV, only the latter case can occur, since interrupt priorities are
* handled in the ultravisor.
*/
static int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
@ -285,10 +293,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
vcpu->stat.exit_external_interrupt++;
rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
if (rc)
return rc;
/* We can not handle clock comparator or timer interrupt with bad PSW */
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
newpsw = vcpu->arch.sie_block->gpsw;
} else {
rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
if (rc)
return rc;
}
/*
* Clock comparator or timer interrupt with external interrupt enabled
* will cause interrupt loop. Drop to userspace.
*/
if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
(newpsw.mask & PSW_MASK_EXT))
return -EOPNOTSUPP;

View File

@ -573,6 +573,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_VCPU_RESETS:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:

View File

@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
&& ioapic->irr & (1 << index))
ioapic_service(ioapic, index, false);
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
/*
* Pending status in irr may be outdated: the IRQ line may have
* already been deasserted by a device while the IRQ was masked.
* This occurs, for instance, if the interrupt is handled in a
* Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
* case the guest acknowledges the interrupt to the device in
* its threaded irq handler, i.e. after the EOI but before
* unmasking, so at the time of unmasking the IRQ line is
* already down but our pending irr bit is still set. In such
* cases, injecting this pending interrupt to the guest is
* buggy: the guest will receive an extra unwanted interrupt.
*
* So we need to check here if the IRQ is actually still pending.
* As we are generally not able to probe the IRQ line status
* directly, we do it through irqfd resampler. Namely, we clear
* the pending status and notify the resampler that this interrupt
* is done, without actually injecting it into the guest. If the
* IRQ line is actually already deasserted, we are done. If it is
* still asserted, a new interrupt will be shortly triggered
* through irqfd and injected into the guest.
*
* If, however, it's not possible to resample (no irqfd resampler
* registered for this irq), then unconditionally inject this
* pending interrupt into the guest, so the guest will not miss
* an interrupt, although may get an extra unwanted interrupt.
*/
if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
ioapic->irr &= ~(1 << index);
else
ioapic_service(ioapic, index, false);
}
if (e->fields.delivery_mode == APIC_DM_FIXED) {
struct kvm_lapic_irq irq;

View File

@ -12,6 +12,11 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
int hv_remote_flush_tlb(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
static inline int hv_remote_flush_tlb(struct kvm *kvm)
{
return -EOPNOTSUPP;
}
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
}

View File

@ -3729,7 +3729,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@ -3753,6 +3753,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation--;
}
static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
/*
* When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
* flush the NPT mappings via hypercall as flushing the ASID only
* affects virtual to physical mappings, it does not invalidate guest
* physical to host physical mappings.
*/
if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
hyperv_flush_guest_mapping(root_tdp);
svm_flush_tlb_asid(vcpu);
}
static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
/*
* When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
* flushes should be routed to hv_remote_flush_tlb() without requesting
* a "regular" remote flush. Reaching this point means either there's
* a KVM bug or a prior hv_remote_flush_tlb() call failed, both of
* which might be fatal to the guest. Yell, but try to recover.
*/
if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
hv_remote_flush_tlb(vcpu->kvm);
svm_flush_tlb_asid(vcpu);
}
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
struct vcpu_svm *svm = to_svm(vcpu);
@ -4745,10 +4776,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
.flush_tlb_all = svm_flush_tlb_current,
.flush_tlb_all = svm_flush_tlb_all,
.flush_tlb_current = svm_flush_tlb_current,
.flush_tlb_gva = svm_flush_tlb_gva,
.flush_tlb_guest = svm_flush_tlb_current,
.flush_tlb_guest = svm_flush_tlb_asid,
.vcpu_pre_run = svm_vcpu_pre_run,
.vcpu_run = svm_vcpu_run,

View File

@ -6,6 +6,8 @@
#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
#include <asm/mshyperv.h>
#if IS_ENABLED(CONFIG_HYPERV)
#include "kvm_onhyperv.h"
@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops;
int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
{
struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
!!hve->hv_enlightenments_control.enlightened_npt_tlb;
}
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
@ -80,6 +90,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
}
#else
static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
{
return false;
}
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}

View File

@ -3868,7 +3868,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
exit_qual = 0;
}
if (ex->has_error_code) {
/*
* Unlike AMD's Paged Real Mode, which reports an error code on #PF
* VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
* "has error code" flags on VM-Exit if the CPU is in Real Mode.
*/
if (ex->has_error_code && is_protmode(vcpu)) {
/*
* Intel CPUs do not generate error codes with bits 31:16 set,
* and more importantly VMX disallows setting bits 31:16 in the

View File

@ -4432,6 +4432,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VAPIC:
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@ -8903,6 +8904,8 @@ restart:
}
if (ctxt->have_exception) {
WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
vcpu->mmio_needed = false;
r = 1;
inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
@ -9906,13 +9909,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
/*
* Suppress the error code if the vCPU is in Real Mode, as Real Mode
* exceptions don't report error codes. The presence of an error code
* is carried with the exception and only stripped when the exception
* is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
* report an error code despite the CPU being in Real Mode.
*/
vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
trace_kvm_inj_exception(vcpu->arch.exception.vector,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
vcpu->arch.exception.error_code = false;
static_call(kvm_x86_inject_exception)(vcpu);
}

View File

@ -755,6 +755,7 @@ struct kvm {
struct {
spinlock_t lock;
struct list_head items;
/* resampler_list update side is protected by resampler_lock. */
struct list_head resampler_list;
struct mutex resampler_lock;
} irqfds;
@ -1986,6 +1987,9 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
#ifdef CONFIG_HAVE_KVM_IRQFD
int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
void kvm_irqfd_release(struct kvm *kvm);
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
unsigned int irqchip,
unsigned int pin);
void kvm_irq_routing_update(struct kvm *);
#else
static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
@ -1994,6 +1998,13 @@ static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
}
static inline void kvm_irqfd_release(struct kvm *kvm) {}
static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
unsigned int irqchip,
unsigned int pin)
{
return false;
}
#endif
#else

View File

@ -31,7 +31,7 @@ struct kvm_kernel_irqfd_resampler {
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};

View File

@ -55,6 +55,15 @@ irqfd_inject(struct work_struct *work)
irqfd->gsi, 1, false);
}
static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
struct kvm_kernel_irqfd *irqfd;
list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
srcu_read_lock_held(&resampler->kvm->irq_srcu))
eventfd_signal(irqfd->resamplefd, 1);
}
/*
* Since resampler irqfds share an IRQ source ID, we de-assert once
* then notify all of the resampler irqfds using this GSI. We can't
@ -65,7 +74,6 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kernel_irqfd_resampler *resampler;
struct kvm *kvm;
struct kvm_kernel_irqfd *irqfd;
int idx;
resampler = container_of(kian,
@ -76,11 +84,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
resampler->notifier.gsi, 0, false);
idx = srcu_read_lock(&kvm->irq_srcu);
list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
srcu_read_lock_held(&kvm->irq_srcu))
eventfd_signal(irqfd->resamplefd, 1);
irqfd_resampler_notify(resampler);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
@ -96,8 +100,12 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del(&resampler->link);
list_del_rcu(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
/*
* synchronize_srcu(&kvm->irq_srcu) already called
* in kvm_unregister_irq_ack_notifier().
*/
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
@ -369,7 +377,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
resampler->notifier.irq_acked = irqfd_resampler_ack;
INIT_LIST_HEAD(&resampler->link);
list_add(&resampler->link, &kvm->irqfds.resampler_list);
list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
kvm_register_irq_ack_notifier(kvm,
&resampler->notifier);
irqfd->resampler = resampler;
@ -644,6 +652,31 @@ void kvm_irq_routing_update(struct kvm *kvm)
spin_unlock_irq(&kvm->irqfds.lock);
}
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
unsigned int irqchip,
unsigned int pin)
{
struct kvm_kernel_irqfd_resampler *resampler;
int gsi, idx;
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1) {
list_for_each_entry_srcu(resampler,
&kvm->irqfds.resampler_list, link,
srcu_read_lock_held(&kvm->irq_srcu)) {
if (resampler->notifier.gsi == gsi) {
irqfd_resampler_notify(resampler);
srcu_read_unlock(&kvm->irq_srcu, idx);
return true;
}
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
return false;
}
/*
* create a host-wide workqueue for issuing deferred shutdown requests
* aggregated from all vm* instances. We need our own isolated

View File

@ -4479,7 +4479,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
case KVM_CAP_IRQFD:
case KVM_CAP_IRQFD_RESAMPLE:
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM: