Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Fix MTE shared page detection

   - Enable selftest's use of PMU registers when asked to

  s390:

   - restore 5.13 debugfs names

  x86:

   - fix sizes for vcpu-id indexed arrays

   - fixes for AMD virtualized LAPIC (AVIC)

   - other small bugfixes

  Generic:

   - access tracking performance test

   - dirty_log_perf_test command line parsing fix

   - Fix selftest use of obsolete pthread_yield() in favour of
     sched_yield()

   - use cpu_relax when halt polling

   - fixed missing KVM_CLEAR_DIRTY_LOG compat ioctl"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: add missing compat KVM_CLEAR_DIRTY_LOG
  KVM: use cpu_relax when halt polling
  KVM: SVM: use vmcb01 in svm_refresh_apicv_exec_ctrl
  KVM: SVM: tweak warning about enabled AVIC on nested entry
  KVM: SVM: svm_set_vintr don't warn if AVIC is active but is about to be deactivated
  KVM: s390: restore old debugfs names
  KVM: SVM: delay svm_vcpu_init_msrpm after svm->vmcb is initialized
  KVM: selftests: Introduce access_tracking_perf_test
  KVM: selftests: Fix missing break in dirty_log_perf_test arg parsing
  x86/kvm: fix vcpu-id indexed array sizes
  KVM: x86: Check the right feature bit for MSR_KVM_ASYNC_PF_ACK access
  docs: virt: kvm: api.rst: replace some characters
  KVM: Documentation: Fix KVM_CAP_ENFORCE_PV_FEATURE_CPUID name
  KVM: nSVM: Swap the parameter order for svm_copy_vmrun_state()/svm_copy_vmloadsave_state()
  KVM: nSVM: Rename nested_svm_vmloadsave() to svm_copy_vmloadsave_state()
  KVM: arm64: selftests: get-reg-list: actually enable pmu regs in pmu sublist
  KVM: selftests: change pthread_yield to sched_yield
  KVM: arm64: Fix detection of shared VMAs on guest fault
Linus Torvalds 2021-07-29 09:42:09 -07:00
commit 7e96bf4762
20 changed files with 537 additions and 71 deletions

Documentation/virt/kvm/api.rst

@ -855,7 +855,7 @@ in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
use PPIs designated for specific cpus. The irq field is interpreted
like this::
 bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 |
bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 |
field: | vcpu2_index | irq_type | vcpu_index | irq_id |
The irq_type field has the following values:
@ -2149,10 +2149,10 @@ prior to calling the KVM_RUN ioctl.
Errors:
====== ============================================================
 ENOENT   no such register
 EINVAL   invalid register ID, or no such register or used with VMs in
ENOENT no such register
EINVAL invalid register ID, or no such register or used with VMs in
protected virtualization mode on s390
 EPERM    (arm64) register access not allowed before vcpu finalization
EPERM (arm64) register access not allowed before vcpu finalization
====== ============================================================
(These error codes are indicative only: do not rely on a specific error
@ -2590,10 +2590,10 @@ following id bit patterns::
Errors include:
======== ============================================================
 ENOENT   no such register
 EINVAL   invalid register ID, or no such register or used with VMs in
ENOENT no such register
EINVAL invalid register ID, or no such register or used with VMs in
protected virtualization mode on s390
 EPERM    (arm64) register access not allowed before vcpu finalization
EPERM (arm64) register access not allowed before vcpu finalization
======== ============================================================
(These error codes are indicative only: do not rely on a specific error
@ -3112,13 +3112,13 @@ current state. "addr" is ignored.
Errors:
====== =================================================================
 EINVAL    the target is unknown, or the combination of features is invalid.
 ENOENT    a features bit specified is unknown.
EINVAL the target is unknown, or the combination of features is invalid.
ENOENT a features bit specified is unknown.
====== =================================================================
This tells KVM what type of CPU to present to the guest, and what
optional features it should have.  This will cause a reset of the cpu
registers to their initial values.  If this is not called, KVM_RUN will
optional features it should have. This will cause a reset of the cpu
registers to their initial values. If this is not called, KVM_RUN will
return ENOEXEC for that vcpu.
The initial values are defined as:
@ -3239,8 +3239,8 @@ VCPU matching underlying host.
Errors:
===== ==============================================================
 E2BIG     the reg index list is too big to fit in the array specified by
            the user (the number required will be written into n).
E2BIG the reg index list is too big to fit in the array specified by
the user (the number required will be written into n).
===== ==============================================================
::
@ -3288,7 +3288,7 @@ specific device.
ARM/arm64 divides the id field into two parts, a device id and an
address type id specific to the individual device::
 bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 |
bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 |
field: | 0x00000000 | device id | addr type id |
ARM/arm64 currently only require this when using the in-kernel GIC
@ -7049,7 +7049,7 @@ In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
trap and emulate MSRs that are outside of the scope of KVM as well as
limit the attack surface on KVM's MSR emulation code.
8.28 KVM_CAP_ENFORCE_PV_CPUID
8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
-----------------------------
Architectures: x86

arch/arm64/kvm/mmu.c

@ -947,7 +947,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = get_vma_page_shift(vma, hva);
}
shared = (vma->vm_flags & VM_PFNMAP);
shared = (vma->vm_flags & VM_SHARED);
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED

arch/s390/include/asm/kvm_host.h

@ -445,15 +445,15 @@ struct kvm_vcpu_stat {
u64 instruction_sigp_init_cpu_reset;
u64 instruction_sigp_cpu_reset;
u64 instruction_sigp_unknown;
u64 diagnose_10;
u64 diagnose_44;
u64 diagnose_9c;
u64 diagnose_9c_ignored;
u64 diagnose_9c_forward;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;
u64 diagnose_other;
u64 instruction_diagnose_10;
u64 instruction_diagnose_44;
u64 instruction_diagnose_9c;
u64 diag_9c_ignored;
u64 diag_9c_forward;
u64 instruction_diagnose_258;
u64 instruction_diagnose_308;
u64 instruction_diagnose_500;
u64 instruction_diagnose_other;
u64 pfault_sync;
};

arch/s390/kvm/diag.c

@ -24,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE;
vcpu->stat.diagnose_10++;
vcpu->stat.instruction_diagnose_10++;
if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end
|| start < 2 * PAGE_SIZE)
@ -74,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx",
vcpu->run->s.regs.gprs[rx]);
vcpu->stat.diagnose_258++;
vcpu->stat.instruction_diagnose_258++;
if (vcpu->run->s.regs.gprs[rx] & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
@ -145,7 +145,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
vcpu->stat.diagnose_44++;
vcpu->stat.instruction_diagnose_44++;
kvm_vcpu_on_spin(vcpu, true);
return 0;
}
@ -169,7 +169,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
int tid;
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
vcpu->stat.diagnose_9c++;
vcpu->stat.instruction_diagnose_9c++;
/* yield to self */
if (tid == vcpu->vcpu_id)
@ -192,7 +192,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5,
"diag time slice end directed to %d: yield forwarded",
tid);
vcpu->stat.diagnose_9c_forward++;
vcpu->stat.diag_9c_forward++;
return 0;
}
@ -203,7 +203,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
return 0;
no_yield:
VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid);
vcpu->stat.diagnose_9c_ignored++;
vcpu->stat.diag_9c_ignored++;
return 0;
}
@ -213,7 +213,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff;
VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode);
vcpu->stat.diagnose_308++;
vcpu->stat.instruction_diagnose_308++;
switch (subcode) {
case 3:
vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR;
@ -245,7 +245,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
{
int ret;
vcpu->stat.diagnose_500++;
vcpu->stat.instruction_diagnose_500++;
/* No virtio-ccw notification? Get out quickly. */
if (!vcpu->kvm->arch.css_support ||
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
@ -299,7 +299,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
case 0x500:
return __diag_virtio_hypercall(vcpu);
default:
vcpu->stat.diagnose_other++;
vcpu->stat.instruction_diagnose_other++;
return -EOPNOTSUPP;
}
}

arch/s390/kvm/kvm-s390.c

@ -163,15 +163,15 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
STATS_DESC_COUNTER(VCPU, diagnose_10),
STATS_DESC_COUNTER(VCPU, diagnose_44),
STATS_DESC_COUNTER(VCPU, diagnose_9c),
STATS_DESC_COUNTER(VCPU, diagnose_9c_ignored),
STATS_DESC_COUNTER(VCPU, diagnose_9c_forward),
STATS_DESC_COUNTER(VCPU, diagnose_258),
STATS_DESC_COUNTER(VCPU, diagnose_308),
STATS_DESC_COUNTER(VCPU, diagnose_500),
STATS_DESC_COUNTER(VCPU, diagnose_other),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_10),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_44),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c),
STATS_DESC_COUNTER(VCPU, diag_9c_ignored),
STATS_DESC_COUNTER(VCPU, diag_9c_forward),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_258),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
STATS_DESC_COUNTER(VCPU, pfault_sync)
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==

arch/x86/kvm/ioapic.c

@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);

arch/x86/kvm/ioapic.h

@ -43,13 +43,13 @@ struct kvm_vcpu;
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
u8 vectors[KVM_MAX_VCPU_ID];
u8 vectors[KVM_MAX_VCPU_ID + 1];
};

arch/x86/kvm/svm/avic.c

@ -646,7 +646,7 @@ out:
void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
struct vmcb *vmcb = svm->vmcb01.ptr;
bool activated = kvm_vcpu_apicv_active(vcpu);
if (!enable_apicv)

arch/x86/kvm/svm/nested.c

@ -515,7 +515,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
* Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
* avic_physical_id.
*/
WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
/* Copied from vmcb01. msrpm_base can be overwritten later. */
svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
@ -702,8 +702,8 @@ out:
}
/* Copy state save area fields which are handled by VMRUN */
void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
struct vmcb_save_area *to_save)
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
struct vmcb_save_area *from_save)
{
to_save->es = from_save->es;
to_save->cs = from_save->cs;
@ -722,7 +722,7 @@ void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
to_save->cpl = 0;
}
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
to_vmcb->save.fs = from_vmcb->save.fs;
to_vmcb->save.gs = from_vmcb->save.gs;
@ -1385,7 +1385,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save);
svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
nested_load_control_from_vmcb12(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);

arch/x86/kvm/svm/svm.c

@ -1406,8 +1406,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
goto error_free_vmsa_page;
}
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
@ -1419,6 +1417,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm_switch_vmcb(svm, &svm->vmcb01);
init_vmcb(vcpu);
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
@ -1568,8 +1568,11 @@ static void svm_set_vintr(struct vcpu_svm *svm)
{
struct vmcb_control_area *control;
/* The following fields are ignored when AVIC is enabled */
WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
/*
* The following fields are ignored when AVIC is enabled
*/
WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
@ -2147,11 +2150,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
ret = kvm_skip_emulated_instruction(vcpu);
if (vmload) {
nested_svm_vmloadsave(vmcb12, svm->vmcb);
svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
svm->sysenter_eip_hi = 0;
svm->sysenter_esp_hi = 0;
} else
nested_svm_vmloadsave(svm->vmcb, vmcb12);
} else {
svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
}
kvm_vcpu_unmap(vcpu, &map, true);
@ -4344,8 +4348,8 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
map_save.hva + 0x400);
svm_copy_vmrun_state(map_save.hva + 0x400,
&svm->vmcb01.ptr->save);
kvm_vcpu_unmap(vcpu, &map_save, true);
}
@ -4393,8 +4397,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
&map_save) == -EINVAL)
return 1;
svm_copy_vmrun_state(map_save.hva + 0x400,
&svm->vmcb01.ptr->save);
svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
map_save.hva + 0x400);
kvm_vcpu_unmap(vcpu, &map_save, true);
}

arch/x86/kvm/svm/svm.h

@ -464,9 +464,9 @@ void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);
void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
struct vmcb_save_area *to_save);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
struct vmcb_save_area *from_save);
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)

arch/x86/kvm/svm/svm_onhyperv.h

@ -89,7 +89,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
* as we mark it dirty unconditionally towards end of vcpu
* init phase.
*/
if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
hve->hv_enlightenments_control.msr_bitmap)
vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
}

arch/x86/kvm/x86.c

@ -3407,7 +3407,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
break;
case MSR_KVM_ASYNC_PF_ACK:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
vcpu->arch.apf.pageready_pending = false;
@ -3746,7 +3746,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
msr_info->data = 0;

tools/testing/selftests/kvm/.gitignore

@ -38,6 +38,7 @@
/x86_64/xen_vmcall_test
/x86_64/xss_msr_test
/x86_64/vmx_pmu_msrs_test
/access_tracking_perf_test
/demand_paging_test
/dirty_log_test
/dirty_log_perf_test

tools/testing/selftests/kvm/Makefile

@ -71,6 +71,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test

tools/testing/selftests/kvm/aarch64/get-reg-list.c

@ -1019,7 +1019,8 @@ static __u64 sve_rejects_set[] = {
#define VREGS_SUBLIST \
{ "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), }
#define PMU_SUBLIST \
{ "pmu", .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), }
{ "pmu", .capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \
.regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), }
#define SVE_SUBLIST \
{ "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \
.regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \

tools/testing/selftests/kvm/access_tracking_perf_test.c

@ -0,0 +1,429 @@
// SPDX-License-Identifier: GPL-2.0
/*
* access_tracking_perf_test
*
* Copyright (C) 2021, Google, Inc.
*
* This test measures the performance effects of KVM's access tracking.
* Access tracking is driven by the MMU notifiers test_young, clear_young, and
* clear_flush_young. These notifiers do not have a direct userspace API,
* however the clear_young notifier can be triggered by marking a page as idle
* in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
* enable access tracking on guest memory.
*
* To measure performance this test runs a VM with a configurable number of
* vCPUs that each touch every page in disjoint regions of memory. Performance
* is measured in the time it takes all vCPUs to finish touching their
* predefined region.
*
* Note that a deterministic correctness test of access tracking is not possible
* by using page_idle as it exists today. This is for a few reasons:
*
* 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
* means subsequent guest accesses are not guaranteed to see page table
* updates made by KVM until some time in the future.
*
* 2. page_idle only operates on LRU pages. Newly allocated pages are not
* immediately allocated to LRU lists. Instead they are held in a "pagevec",
* which is drained to LRU lists some time in the future. There is no
* userspace API to force this drain to occur.
*
* These limitations are worked around in this test by using a large enough
* region of memory for each vCPU such that the number of translations cached in
* the TLB and the number of pages held in pagevecs are a small fraction of the
* overall workload. And if either of those conditions are not true this test
* will fail rather than silently passing.
*/
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_util.h"
#include "test_util.h"
#include "perf_test_util.h"
#include "guest_modes.h"
/* Global variable used to synchronize all of the vCPU threads. */
static int iteration = -1;
/* Defines what vCPU threads should do during a given iteration. */
static enum {
/* Run the vCPU to access all its memory. */
ITERATION_ACCESS_MEMORY,
/* Mark the vCPU's memory idle in page_idle. */
ITERATION_MARK_IDLE,
} iteration_work;
/* Set to true when vCPU threads should exit. */
static bool done;
/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;
struct test_params {
/* The backing source for the region of memory. */
enum vm_mem_backing_src_type backing_src;
/* The amount of memory to allocate for each vCPU. */
uint64_t vcpu_memory_bytes;
/* The number of vCPUs to create in the VM. */
int vcpus;
};
static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
{
uint64_t value;
off_t offset = index * sizeof(value);
TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value),
"pread from %s offset 0x%" PRIx64 " failed!",
filename, offset);
return value;
}
#define PAGEMAP_PRESENT (1ULL << 63)
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)
static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva)
{
uint64_t hva = (uint64_t) addr_gva2hva(vm, gva);
uint64_t entry;
uint64_t pfn;
entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize());
if (!(entry & PAGEMAP_PRESENT))
return 0;
pfn = entry & PAGEMAP_PFN_MASK;
if (!pfn) {
print_skip("Looking up PFNs requires CAP_SYS_ADMIN");
exit(KSFT_SKIP);
}
return pfn;
}
static bool is_page_idle(int page_idle_fd, uint64_t pfn)
{
uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64);
return !!((bits >> (pfn % 64)) & 1);
}
static void mark_page_idle(int page_idle_fd, uint64_t pfn)
{
uint64_t bits = 1ULL << (pfn % 64);
TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8,
"Set page_idle bits for PFN 0x%" PRIx64, pfn);
}
static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id)
{
uint64_t base_gva = perf_test_args.vcpu_args[vcpu_id].gva;
uint64_t pages = perf_test_args.vcpu_args[vcpu_id].pages;
uint64_t page;
uint64_t still_idle = 0;
uint64_t no_pfn = 0;
int page_idle_fd;
int pagemap_fd;
/* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */
if (overlap_memory_access && vcpu_id)
return;
page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle.");
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap.");
for (page = 0; page < pages; page++) {
uint64_t gva = base_gva + page * perf_test_args.guest_page_size;
uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);
if (!pfn) {
no_pfn++;
continue;
}
if (is_page_idle(page_idle_fd, pfn)) {
still_idle++;
continue;
}
mark_page_idle(page_idle_fd, pfn);
}
/*
* Assumption: Less than 1% of pages are going to be swapped out from
* under us during this test.
*/
TEST_ASSERT(no_pfn < pages / 100,
"vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.",
vcpu_id, no_pfn, pages);
/*
* Test that at least 90% of memory has been marked idle (the rest might
* not be marked idle because the pages have not yet made it to an LRU
* list or the translations are still cached in the TLB). 90% is
* arbitrary; high enough that we ensure most memory access went through
* access tracking but low enough as to not make the test too brittle
* over time and across architectures.
*/
TEST_ASSERT(still_idle < pages / 10,
"vCPU%d: Too many pages still idle (%"PRIu64 " out of %"
PRIu64 ").\n",
vcpu_id, still_idle, pages);
close(page_idle_fd);
close(pagemap_fd);
}
static void assert_ucall(struct kvm_vm *vm, uint32_t vcpu_id,
uint64_t expected_ucall)
{
struct ucall uc;
uint64_t actual_ucall = get_ucall(vm, vcpu_id, &uc);
TEST_ASSERT(expected_ucall == actual_ucall,
"Guest exited unexpectedly (expected ucall %" PRIu64
", got %" PRIu64 ")",
expected_ucall, actual_ucall);
}
static bool spin_wait_for_next_iteration(int *current_iteration)
{
int last_iteration = *current_iteration;
do {
if (READ_ONCE(done))
return false;
*current_iteration = READ_ONCE(iteration);
} while (last_iteration == *current_iteration);
return true;
}
static void *vcpu_thread_main(void *arg)
{
struct perf_test_vcpu_args *vcpu_args = arg;
struct kvm_vm *vm = perf_test_args.vm;
int vcpu_id = vcpu_args->vcpu_id;
int current_iteration = -1;
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
while (spin_wait_for_next_iteration(&current_iteration)) {
switch (READ_ONCE(iteration_work)) {
case ITERATION_ACCESS_MEMORY:
vcpu_run(vm, vcpu_id);
assert_ucall(vm, vcpu_id, UCALL_SYNC);
break;
case ITERATION_MARK_IDLE:
mark_vcpu_memory_idle(vm, vcpu_id);
break;
};
vcpu_last_completed_iteration[vcpu_id] = current_iteration;
}
return NULL;
}
static void spin_wait_for_vcpu(int vcpu_id, int target_iteration)
{
while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) !=
target_iteration) {
continue;
}
}
/* The type of memory accesses to perform in the VM. */
enum access_type {
ACCESS_READ,
ACCESS_WRITE,
};
static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description)
{
struct timespec ts_start;
struct timespec ts_elapsed;
int next_iteration;
int vcpu_id;
/* Kick off the vCPUs by incrementing iteration. */
next_iteration = ++iteration;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
/* Wait for all vCPUs to finish the iteration. */
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++)
spin_wait_for_vcpu(vcpu_id, next_iteration);
ts_elapsed = timespec_elapsed(ts_start);
pr_info("%-30s: %ld.%09lds\n",
description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
}
static void access_memory(struct kvm_vm *vm, int vcpus, enum access_type access,
const char *description)
{
perf_test_args.wr_fract = (access == ACCESS_READ) ? INT_MAX : 1;
sync_global_to_guest(vm, perf_test_args);
iteration_work = ITERATION_ACCESS_MEMORY;
run_iteration(vm, vcpus, description);
}
static void mark_memory_idle(struct kvm_vm *vm, int vcpus)
{
/*
* Even though this parallelizes the work across vCPUs, this is still a
* very slow operation because page_idle forces the test to mark one pfn
* at a time and the clear_young notifier serializes on the KVM MMU
* lock.
*/
pr_debug("Marking VM memory idle (slow)...\n");
iteration_work = ITERATION_MARK_IDLE;
run_iteration(vm, vcpus, "Mark memory idle");
}
static pthread_t *create_vcpu_threads(int vcpus)
{
pthread_t *vcpu_threads;
int i;
vcpu_threads = malloc(vcpus * sizeof(vcpu_threads[0]));
TEST_ASSERT(vcpu_threads, "Failed to allocate vcpu_threads.");
for (i = 0; i < vcpus; i++) {
vcpu_last_completed_iteration[i] = iteration;
pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main,
&perf_test_args.vcpu_args[i]);
}
return vcpu_threads;
}
static void terminate_vcpu_threads(pthread_t *vcpu_threads, int vcpus)
{
int i;
/* Set done to signal the vCPU threads to exit */
done = true;
for (i = 0; i < vcpus; i++)
pthread_join(vcpu_threads[i], NULL);
}
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct test_params *params = arg;
struct kvm_vm *vm;
pthread_t *vcpu_threads;
int vcpus = params->vcpus;
vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes,
params->backing_src);
perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes,
!overlap_memory_access);
vcpu_threads = create_vcpu_threads(vcpus);
pr_info("\n");
access_memory(vm, vcpus, ACCESS_WRITE, "Populating memory");
/* As a control, read and write to the populated memory first. */
access_memory(vm, vcpus, ACCESS_WRITE, "Writing to populated memory");
access_memory(vm, vcpus, ACCESS_READ, "Reading from populated memory");
/* Repeat on memory that has been marked as idle. */
mark_memory_idle(vm, vcpus);
access_memory(vm, vcpus, ACCESS_WRITE, "Writing to idle memory");
mark_memory_idle(vm, vcpus);
access_memory(vm, vcpus, ACCESS_READ, "Reading from idle memory");
terminate_vcpu_threads(vcpu_threads, vcpus);
free(vcpu_threads);
perf_test_destroy_vm(vm);
}
static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
name);
puts("");
printf(" -h: Display this help message.");
guest_modes_help();
printf(" -b: specify the size of the memory region which should be\n"
" dirtied by each vCPU. e.g. 10M or 3G.\n"
" (default: 1G)\n");
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
printf(" -s: specify the type of memory that should be used to\n"
" back the guest data region.\n\n");
backing_src_help();
puts("");
exit(0);
}
int main(int argc, char *argv[])
{
struct test_params params = {
.backing_src = VM_MEM_SRC_ANONYMOUS,
.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
.vcpus = 1,
};
int page_idle_fd;
int opt;
guest_modes_append_default();
while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
break;
case 'b':
params.vcpu_memory_bytes = parse_size(optarg);
break;
case 'v':
params.vcpus = atoi(optarg);
break;
case 'o':
overlap_memory_access = true;
break;
case 's':
params.backing_src = parse_backing_src_type(optarg);
break;
case 'h':
default:
help(argv[0]);
break;
}
}
page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
if (page_idle_fd < 0) {
print_skip("CONFIG_IDLE_PAGE_TRACKING is not enabled");
exit(KSFT_SKIP);
}
close(page_idle_fd);
for_each_guest_mode(run_test, &params);
return 0;
}

tools/testing/selftests/kvm/dirty_log_perf_test.c

@ -312,6 +312,7 @@ int main(int argc, char *argv[])
break;
case 'o':
p.partition_vcpu_memory_access = false;
break;
case 's':
p.backing_src = parse_backing_src_type(optarg);
break;

tools/testing/selftests/kvm/steal_time.c

@ -320,7 +320,7 @@ int main(int ac, char **av)
run_delay = get_run_delay();
pthread_create(&thread, &attr, do_steal_time, NULL);
do
pthread_yield();
sched_yield();
while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS);
pthread_join(thread, NULL);
run_delay = get_run_delay() - run_delay;

virt/kvm/kvm_main.c

@ -3110,6 +3110,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
++vcpu->stat.generic.halt_poll_invalid;
goto out;
}
cpu_relax();
poll_end = cur = ktime_get();
} while (kvm_vcpu_can_poll(cur, stop));
}
@ -4390,6 +4391,16 @@ struct compat_kvm_dirty_log {
};
};
struct compat_kvm_clear_dirty_log {
__u32 slot;
__u32 num_pages;
__u64 first_page;
union {
compat_uptr_t dirty_bitmap; /* one bit per page */
__u64 padding2;
};
};
static long kvm_vm_compat_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@ -4399,6 +4410,24 @@ static long kvm_vm_compat_ioctl(struct file *filp,
if (kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
case KVM_CLEAR_DIRTY_LOG: {
struct compat_kvm_clear_dirty_log compat_log;
struct kvm_clear_dirty_log log;
if (copy_from_user(&compat_log, (void __user *)arg,
sizeof(compat_log)))
return -EFAULT;
log.slot = compat_log.slot;
log.num_pages = compat_log.num_pages;
log.first_page = compat_log.first_page;
log.padding2 = compat_log.padding2;
log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
break;
}
#endif
case KVM_GET_DIRTY_LOG: {
struct compat_kvm_dirty_log compat_log;
struct kvm_dirty_log log;