From 9b22bf578332d3e326c349bc8a8789af3d952435 Mon Sep 17 00:00:00 2001 From: Dor Laor Date: Mon, 19 Feb 2007 16:44:49 +0200 Subject: [PATCH 01/66] KVM: Fix guest register corruption on paravirt hypercall The hypercall code mixes up the ->cache_regs() and ->decache_regs() callbacks, resulting in guest register corruption. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index dc7a8c78cbf9..ff7c836ff001 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1177,7 +1177,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->cache_regs(vcpu); ret = -KVM_EINVAL; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1204,7 +1204,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) ; } vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_arch_ops->cache_regs(vcpu); + kvm_arch_ops->decache_regs(vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_hypercall); From 510043da8582ad49d22a1e9a6b211e6ede10cd2e Mon Sep 17 00:00:00 2001 From: Dor Laor Date: Mon, 19 Feb 2007 18:25:43 +0200 Subject: [PATCH 02/66] KVM: Use the generic skip_emulated_instruction() in hypercall code Instead of twiddling the rip registers directly, use the skip_emulated_instruction() function to do that for us. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 3 ++- drivers/kvm/vmx.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 3d8ea7ac2ecc..6787f11738cf 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1078,7 +1078,8 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vcpu->svm->vmcb->save.rip += 3; + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index fbbf9d6b299f..a721b60f7385 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1658,7 +1658,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } From bbe4432e669ab94fc8059e7ab878cafad7b8d123 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Mar 2007 13:27:36 +0200 Subject: [PATCH 03/66] KVM: Use own minor number Use the minor number (232) allocated to kvm by lanana. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 2 +- include/linux/miscdevice.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ff7c836ff001..946ed86a0595 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2299,7 +2299,7 @@ static struct file_operations kvm_chardev_ops = { }; static struct miscdevice kvm_dev = { - MISC_DYNAMIC_MINOR, + KVM_MINOR, "kvm", &kvm_chardev_ops, }; diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 326da7d500c7..dff9ea32606a 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -29,6 +29,7 @@ #define TUN_MINOR 200 #define HPET_MINOR 228 +#define KVM_MINOR 232 struct device; From ff42697436ddf5bd026e2cb4f117656b967f0709 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Mar 2007 09:29:48 +0200 Subject: [PATCH 04/66] KVM: Export This allows users to actually build prgrams that use kvm without the entire source tree. Signed-off-by: Avi Kivity --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 4ff0f57d0add..9f05279e7dd3 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -96,6 +96,7 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h +header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h From 1ea252afcd4b264b71d9c3f55358ff5ba4c04f1b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 8 Mar 2007 11:48:09 +0200 Subject: [PATCH 05/66] KVM: Fix bogus sign extension in mmu mapping audit When auditing a 32-bit guest on a 64-bit host, sign extension of the page table directory pointer table index caused bogus addresses to be shown on audit errors. Fix by declaring the index unsigned. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index cab26f301eab..2d905770fd88 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1360,7 +1360,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, static void audit_mappings(struct kvm_vcpu *vcpu) { - int i; + unsigned i; if (vcpu->mmu.root_level == 4) audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); From 9a2bb7f486dc639a1cf2ad803bf2227f0dc0809d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Feb 2007 12:58:31 +0200 Subject: [PATCH 06/66] KVM: Use a shared page for kernel/user communication when runing a vcpu Instead of passing a 'struct kvm_run' back and forth between the kernel and userspace, allocate a page and allow the user to mmap() it. This reduces needless copying and makes the interface expandable by providing lots of free space. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 54 +++++++++++++++++++++++++++++++----------- include/linux/kvm.h | 6 ++--- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 0d122bf889db..901b8d917b55 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -228,6 +228,7 @@ struct kvm_vcpu { struct mutex mutex; int cpu; int launched; + struct kvm_run *run; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 946ed86a0595..42be8a8f299d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -355,6 +355,8 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); vcpu_put(vcpu); kvm_arch_ops->vcpu_free(vcpu); + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; } static void kvm_free_vcpus(struct kvm *kvm) @@ -1887,6 +1889,33 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, return r; } +static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (pgoff != 0) + return NOPAGE_SIGBUS; + page = virt_to_page(vcpu->run); + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .nopage = kvm_vcpu_nopage, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; @@ -1899,6 +1928,7 @@ static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, }; /* @@ -1947,6 +1977,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) { int r; struct kvm_vcpu *vcpu; + struct page *page; r = -EINVAL; if (!valid_vcpu(n)) @@ -1961,6 +1992,12 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) return -EEXIST; } + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_unlock; + vcpu->run = page_address(page); + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, FX_IMAGE_ALIGN); vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; @@ -1990,6 +2027,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) out_free_vcpus: kvm_free_vcpu(vcpu); +out_unlock: mutex_unlock(&vcpu->mutex); out: return r; @@ -2003,21 +2041,9 @@ static long kvm_vcpu_ioctl(struct file *filp, int r = -EINVAL; switch (ioctl) { - case KVM_RUN: { - struct kvm_run kvm_run; - - r = -EFAULT; - if (copy_from_user(&kvm_run, argp, sizeof kvm_run)) - goto out; - r = kvm_vcpu_ioctl_run(vcpu, &kvm_run); - if (r < 0 && r != -EINTR) - goto out; - if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) { - r = -EFAULT; - goto out; - } + case KVM_RUN: + r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); break; - } case KVM_GET_REGS: { struct kvm_regs kvm_regs; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 275354ffa1cb..d88e7508ee0a 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 4 +#define KVM_API_VERSION 5 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -49,7 +49,7 @@ enum kvm_exit_reason { KVM_EXIT_SHUTDOWN = 8, }; -/* for KVM_RUN */ +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ __u32 emulated; /* skip current instruction */ @@ -233,7 +233,7 @@ struct kvm_dirty_log { /* * ioctls for vcpu fds */ -#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) +#define KVM_RUN _IO(KVMIO, 16) #define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) #define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) #define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) From 46fc1477887c41c8e900f2c95485e222b9a54822 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Feb 2007 19:39:30 +0200 Subject: [PATCH 07/66] KVM: Do not communicate to userspace through cpu registers during PIO Currently when passing the a PIO emulation request to userspace, we rely on userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx (on string instructions). This (a) requires two extra ioctls for getting and setting the registers and (b) is unfriendly to non-x86 archs, when they get kvm ports. So fix by doing the register fixups in the kernel and passing to userspace only an abstract description of the PIO to be done. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 48 +++++++++++++++++++++++++++++++++++++++--- drivers/kvm/svm.c | 2 ++ drivers/kvm/vmx.c | 2 ++ include/linux/kvm.h | 6 +++--- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 901b8d917b55..59cbc5b1d905 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -274,6 +274,7 @@ struct kvm_vcpu { int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; + int pio_pending; struct { int active; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 42be8a8f299d..ff8bcfee76e5 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1504,6 +1504,44 @@ void save_msrs(struct vmx_msr_entry *e, int n) } EXPORT_SYMBOL_GPL(save_msrs); +static void complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_io *io = &vcpu->run->io; + long delta; + + kvm_arch_ops->cache_regs(vcpu); + + if (!io->string) { + if (io->direction == KVM_EXIT_IO_IN) + memcpy(&vcpu->regs[VCPU_REGS_RAX], &io->value, + io->size); + } else { + delta = 1; + if (io->rep) { + delta *= io->count; + /* + * The size of the register should really depend on + * current address size. + */ + vcpu->regs[VCPU_REGS_RCX] -= delta; + } + if (io->string_down) + delta = -delta; + delta *= io->size; + if (io->direction == KVM_EXIT_IO_IN) + vcpu->regs[VCPU_REGS_RDI] += delta; + else + vcpu->regs[VCPU_REGS_RSI] += delta; + } + + vcpu->pio_pending = 0; + vcpu->run->io_completed = 0; + + kvm_arch_ops->decache_regs(vcpu); + + kvm_arch_ops->skip_emulated_instruction(vcpu); +} + static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -1518,9 +1556,13 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->emulated = 0; } - if (kvm_run->mmio_completed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; + if (kvm_run->io_completed) { + if (vcpu->pio_pending) + complete_pio(vcpu); + else { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + } } vcpu->mmio_needed = 0; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 6787f11738cf..c35b8c83bf3f 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1037,6 +1037,7 @@ static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; + kvm_run->io.count = 1; if (kvm_run->io.string) { unsigned addr_mask; @@ -1056,6 +1057,7 @@ static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } else kvm_run->io.value = vcpu->svm->vmcb->save.rax; + vcpu->pio_pending = 1; return 0; } diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a721b60f7385..4d5f40fcb651 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1459,12 +1459,14 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; kvm_run->io.rep = (exit_qualification & 32) != 0; kvm_run->io.port = exit_qualification >> 16; + kvm_run->io.count = 1; if (kvm_run->io.string) { if (!get_io_count(vcpu, &kvm_run->io.count)) return 1; kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); } else kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ + vcpu->pio_pending = 1; return 0; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index d88e7508ee0a..19aeb3385188 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 5 +#define KVM_API_VERSION 6 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -53,7 +53,7 @@ enum kvm_exit_reason { struct kvm_run { /* in */ __u32 emulated; /* skip current instruction */ - __u32 mmio_completed; /* mmio request completed */ + __u32 io_completed; /* mmio/pio request completed */ __u8 request_interrupt_window; __u8 padding1[7]; @@ -80,7 +80,7 @@ struct kvm_run { __u32 error_code; } ex; /* KVM_EXIT_IO */ - struct { + struct kvm_io { #define KVM_EXIT_IO_IN 0 #define KVM_EXIT_IO_OUT 1 __u8 direction; From 06465c5a3aa9948a7b00af49cd22ed8f235cdb0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 28 Feb 2007 20:46:53 +0200 Subject: [PATCH 08/66] KVM: Handle cpuid in the kernel instead of punting to userspace KVM used to handle cpuid by letting userspace decide what values to return to the guest. We now handle cpuid completely in the kernel. We still let userspace decide which values the guest will see by having userspace set up the value table beforehand (this is necessary to allow management software to set the cpu features to the least common denominator, so that live migration can work). The motivation for the change is that kvm kernel code can be impacted by cpuid features, for example the x86 emulator. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 5 +++ drivers/kvm/kvm_main.c | 69 ++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/svm.c | 4 +-- drivers/kvm/vmx.c | 4 +-- include/linux/kvm.h | 18 ++++++++++- 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 59cbc5b1d905..be3a0e7ecae4 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -55,6 +55,7 @@ #define KVM_NUM_MMU_PAGES 256 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 #define FX_IMAGE_SIZE 512 #define FX_IMAGE_ALIGN 16 @@ -286,6 +287,9 @@ struct kvm_vcpu { u32 ar; } tr, es, ds, fs, gs; } rmode; + + int cpuid_nent; + struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; }; struct kvm_memory_slot { @@ -446,6 +450,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, struct x86_emulate_ctxt; +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ff8bcfee76e5..caec54fbf07f 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1504,6 +1504,43 @@ void save_msrs(struct vmx_msr_entry *e, int n) } EXPORT_SYMBOL_GPL(save_msrs); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; + + kvm_arch_ops->cache_regs(vcpu); + function = vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] = 0; + vcpu->regs[VCPU_REGS_RBX] = 0; + vcpu->regs[VCPU_REGS_RCX] = 0; + vcpu->regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == function) { + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + if (best) { + vcpu->regs[VCPU_REGS_RAX] = best->eax; + vcpu->regs[VCPU_REGS_RBX] = best->ebx; + vcpu->regs[VCPU_REGS_RCX] = best->ecx; + vcpu->regs[VCPU_REGS_RDX] = best->edx; + } + kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + static void complete_pio(struct kvm_vcpu *vcpu) { struct kvm_io *io = &vcpu->run->io; @@ -2075,6 +2112,26 @@ out: return r; } +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + vcpu->cpuid_nent = cpuid->nent; + return 0; + +out: + return r; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2181,6 +2238,18 @@ static long kvm_vcpu_ioctl(struct file *filp, case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); break; + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } default: ; } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index c35b8c83bf3f..d4b2936479d9 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1101,8 +1101,8 @@ static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_r static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 4d5f40fcb651..71410a65bb90 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1585,8 +1585,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 19aeb3385188..15e23bc06e8b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -41,7 +41,6 @@ enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, KVM_EXIT_EXCEPTION = 1, KVM_EXIT_IO = 2, - KVM_EXIT_CPUID = 3, KVM_EXIT_DEBUG = 4, KVM_EXIT_HLT = 5, KVM_EXIT_MMIO = 6, @@ -210,6 +209,22 @@ struct kvm_dirty_log { }; }; +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + #define KVMIO 0xAE /* @@ -243,5 +258,6 @@ struct kvm_dirty_log { #define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) #define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 17, struct kvm_cpuid) #endif From 106b552b43beac2694df5fbafc8f125a72df5f65 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 1 Mar 2007 16:20:40 +0200 Subject: [PATCH 09/66] KVM: Remove the 'emulated' field from the userspace interface We no longer emulate single instructions in userspace. Instead, we service mmio or pio requests. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 5 ----- include/linux/kvm.h | 3 +-- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index caec54fbf07f..5d24203afd20 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1588,11 +1588,6 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; - if (kvm_run->emulated) { - kvm_arch_ops->skip_emulated_instruction(vcpu); - kvm_run->emulated = 0; - } - if (kvm_run->io_completed) { if (vcpu->pio_pending) complete_pio(vcpu); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 15e23bc06e8b..c6dd4a79b74b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -51,10 +51,9 @@ enum kvm_exit_reason { /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ - __u32 emulated; /* skip current instruction */ __u32 io_completed; /* mmio/pio request completed */ __u8 request_interrupt_window; - __u8 padding1[7]; + __u8 padding1[3]; /* out */ __u32 exit_type; From 2a4dac3952468157297b81ae0a29815c02ead179 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 1 Mar 2007 16:47:06 +0200 Subject: [PATCH 10/66] KVM: Remove minor wart from KVM_CREATE_VCPU ioctl That ioctl does not transfer any data, so it should be an _IO rather than an _IOW. Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c6dd4a79b74b..d89189a81aba 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -241,7 +241,7 @@ struct kvm_cpuid { * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. */ -#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int) +#define KVM_CREATE_VCPU _IO(KVMIO, 11) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) /* From 739872c56f3322c38320c7a5a543ef6f56f174bc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 1 Mar 2007 17:20:13 +0200 Subject: [PATCH 11/66] KVM: Renumber ioctls The recent changes have left the ioctl numbers in complete disarray. Signed-off-by: Avi Kivity --- include/linux/kvm.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index d89189a81aba..93472daec120 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -229,34 +229,34 @@ struct kvm_cpuid { /* * ioctls for /dev/kvm fds: */ -#define KVM_GET_API_VERSION _IO(KVMIO, 1) -#define KVM_CREATE_VM _IO(KVMIO, 2) /* returns a VM fd */ -#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) /* * ioctls for VM fds */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. */ -#define KVM_CREATE_VCPU _IO(KVMIO, 11) -#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) /* * ioctls for vcpu fds */ -#define KVM_RUN _IO(KVMIO, 16) -#define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) -#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) -#define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) -#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) -#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) -#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) -#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) -#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) -#define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) -#define KVM_SET_CPUID _IOW(KVMIO, 17, struct kvm_cpuid) +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) #endif From 5d308f4550d9dc4c236e08b0377b610b9578577b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 1 Mar 2007 17:56:20 +0200 Subject: [PATCH 12/66] KVM: Add method to check for backwards-compatible API extensions Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 6 ++++++ include/linux/kvm.h | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 5d24203afd20..39cf8fd343a3 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2416,6 +2416,12 @@ static long kvm_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_CHECK_EXTENSION: + /* + * No extensions defined at present. + */ + r = 0; + break; default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 93472daec120..c93cf53953af 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -232,6 +232,11 @@ struct kvm_cpuid { #define KVM_GET_API_VERSION _IO(KVMIO, 0x00) #define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ #define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) /* * ioctls for VM fds From b4e63f560beb187cffdaf706e534a1e2f9effb66 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Mar 2007 13:59:30 +0200 Subject: [PATCH 13/66] KVM: Allow userspace to process hypercalls which have no kernel handler This is useful for paravirtualized graphics devices, for example. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 18 +++++++++++++++++- include/linux/kvm.h | 10 +++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 39cf8fd343a3..de93117f1e97 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1203,7 +1203,16 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) } switch (nr) { default: - ; + run->hypercall.args[0] = a0; + run->hypercall.args[1] = a1; + run->hypercall.args[2] = a2; + run->hypercall.args[3] = a3; + run->hypercall.args[4] = a4; + run->hypercall.args[5] = a5; + run->hypercall.ret = ret; + run->hypercall.longmode = is_long_mode(vcpu); + kvm_arch_ops->decache_regs(vcpu); + return 0; } vcpu->regs[VCPU_REGS_RAX] = ret; kvm_arch_ops->decache_regs(vcpu); @@ -1599,6 +1608,13 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_needed = 0; + if (kvm_run->exit_type == KVM_EXIT_TYPE_VM_EXIT + && kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { + kvm_arch_ops->cache_regs(vcpu); + vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + kvm_arch_ops->decache_regs(vcpu); + } + r = kvm_arch_ops->run(vcpu, kvm_run); vcpu_put(vcpu); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c93cf53953af..9151ebfa22e9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 6 +#define KVM_API_VERSION 7 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -41,6 +41,7 @@ enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, KVM_EXIT_EXCEPTION = 1, KVM_EXIT_IO = 2, + KVM_EXIT_HYPERCALL = 3, KVM_EXIT_DEBUG = 4, KVM_EXIT_HLT = 5, KVM_EXIT_MMIO = 6, @@ -103,6 +104,13 @@ struct kvm_run { __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; }; }; From 8eb7d334bd8e693340ee198280f7d45035cdab8c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Mar 2007 14:17:08 +0200 Subject: [PATCH 14/66] KVM: Fold kvm_run::exit_type into kvm_run::exit_reason Currently, userspace is told about the nature of the last exit from the guest using two fields, exit_type and exit_reason, where exit_type has just two enumerations (and no need for more). So fold exit_type into exit_reason, reducing the complexity of determining what really happened. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 3 +-- drivers/kvm/svm.c | 7 +++---- drivers/kvm/vmx.c | 7 +++---- include/linux/kvm.h | 15 ++++++++------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index de93117f1e97..ac44df551aa8 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1608,8 +1608,7 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_needed = 0; - if (kvm_run->exit_type == KVM_EXIT_TYPE_VM_EXIT - && kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { kvm_arch_ops->cache_regs(vcpu); vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; kvm_arch_ops->decache_regs(vcpu); diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index d4b2936479d9..b09928f14219 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1298,8 +1298,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 exit_code = vcpu->svm->vmcb->control.exit_code; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; - if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " @@ -1609,8 +1607,9 @@ again: vcpu->svm->next_rip = 0; if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vcpu->svm->vmcb->control.exit_code; post_kvm_run_save(vcpu, kvm_run); return 0; } diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 71410a65bb90..cf9568fbe8a5 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1922,10 +1922,10 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - kvm_run->exit_type = 0; if (fail) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { /* @@ -1935,7 +1935,6 @@ again: profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); vcpu->launched = 1; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { /* Give scheduler a change to reschedule. */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 9151ebfa22e9..57f47ef93829 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 7 +#define KVM_API_VERSION 8 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -34,9 +34,6 @@ struct kvm_memory_region { #define KVM_MEM_LOG_DIRTY_PAGES 1UL -#define KVM_EXIT_TYPE_FAIL_ENTRY 1 -#define KVM_EXIT_TYPE_VM_EXIT 2 - enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, KVM_EXIT_EXCEPTION = 1, @@ -47,6 +44,7 @@ enum kvm_exit_reason { KVM_EXIT_MMIO = 6, KVM_EXIT_IRQ_WINDOW_OPEN = 7, KVM_EXIT_SHUTDOWN = 8, + KVM_EXIT_FAIL_ENTRY = 9, }; /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ @@ -57,12 +55,11 @@ struct kvm_run { __u8 padding1[3]; /* out */ - __u32 exit_type; __u32 exit_reason; __u32 instruction_length; __u8 ready_for_interrupt_injection; __u8 if_flag; - __u16 padding2; + __u8 padding2[6]; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8; @@ -71,8 +68,12 @@ struct kvm_run { union { /* KVM_EXIT_UNKNOWN */ struct { - __u32 hardware_exit_reason; + __u64 hardware_exit_reason; } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { __u32 exception; From 1b19f3e61d7e1edb395dd64bf7d63621a37af8ca Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Mar 2007 14:24:03 +0200 Subject: [PATCH 15/66] KVM: Add a special exit reason when exiting due to an interrupt This is redundant, as we also return -EINTR from the ioctl, but it allows us to examine the exit_reason field on resume without seeing old data. Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 2 ++ drivers/kvm/vmx.c | 2 ++ include/linux/kvm.h | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index b09928f14219..0311665e3c83 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1619,12 +1619,14 @@ again: if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } kvm_resched(vcpu); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index cf9568fbe8a5..e69bab6d811d 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1941,12 +1941,14 @@ again: if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 57f47ef93829..b3af92e7bf5d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 8 +#define KVM_API_VERSION 9 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -45,6 +45,7 @@ enum kvm_exit_reason { KVM_EXIT_IRQ_WINDOW_OPEN = 7, KVM_EXIT_SHUTDOWN = 8, KVM_EXIT_FAIL_ENTRY = 9, + KVM_EXIT_INTR = 10, }; /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ From 6722c51c51518af9581ab6cd9b6aec93774334a6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2007 17:45:40 +0200 Subject: [PATCH 16/66] KVM: Initialize the apic_base msr on svm too Older userspace didn't care, but newer userspace (with the cpuid changes) does. Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 0311665e3c83..2396ada23777 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -582,6 +582,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) init_vmcb(vcpu->svm->vmcb); fx_init(vcpu); + vcpu->apic_base = 0xfee00000 | + /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | + MSR_IA32_APICBASE_ENABLE; return 0; From 1961d276c877b99f5f16aaf36377c75e0e191c3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2007 19:46:05 +0200 Subject: [PATCH 17/66] KVM: Add guest mode signal mask Allow a special signal mask to be used while executing in guest mode. This allows signals to be used to interrupt a vcpu without requiring signal delivery to a userspace handler, which is quite expensive. Userspace still receives -EINTR and can get the signal via sigwait(). Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 3 +++ drivers/kvm/kvm_main.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/kvm.h | 7 +++++++ 3 files changed, 51 insertions(+) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index be3a0e7ecae4..1c4a581938bf 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -277,6 +277,9 @@ struct kvm_vcpu { gpa_t mmio_phys_addr; int pio_pending; + int sigset_active; + sigset_t sigset; + struct { int active; u8 save_iopl; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ac44df551aa8..df85f5f65489 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1591,9 +1591,13 @@ static void complete_pio(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + sigset_t sigsaved; vcpu_load(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; @@ -1616,6 +1620,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = kvm_arch_ops->run(vcpu, kvm_run); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); return r; } @@ -2142,6 +2149,17 @@ out: return r; } +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2260,6 +2278,29 @@ static long kvm_vcpu_ioctl(struct file *filp, goto out; break; } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index b3af92e7bf5d..c0d10cd8088e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -234,6 +234,12 @@ struct kvm_cpuid { struct kvm_cpuid_entry entries[0]; }; +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + #define KVMIO 0xAE /* @@ -273,5 +279,6 @@ struct kvm_cpuid { #define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) #endif From 07c45a366d89f8eaec5d9890e810171b408f9a52 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Mar 2007 13:05:38 +0200 Subject: [PATCH 18/66] KVM: Allow kernel to select size of mmap() buffer This allows us to store offsets in the kernel/user kvm_run area, and be sure that userspace has them mapped. As offsets can be outside the kvm_run struct, userspace has no way of knowing how much to mmap. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 8 +++++++- include/linux/kvm.h | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index df85f5f65489..cba0b87c34e4 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2436,7 +2436,7 @@ static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; - int r = -EINVAL; + long r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: @@ -2478,6 +2478,12 @@ static long kvm_dev_ioctl(struct file *filp, */ r = 0; break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = PAGE_SIZE; + break; default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c0d10cd8088e..dad90816cad8 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -253,6 +253,10 @@ struct kvm_signal_mask { * return is 1 (yes) or 0 (no, sorry). */ #define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ /* * ioctls for VM fds From f0fe510864a4520a85dfa35ae14f5f376c56efc7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Mar 2007 13:11:17 +0200 Subject: [PATCH 19/66] KVM: Future-proof argument-less ioctls Some ioctls ignore their arguments. By requiring them to be zero now, we allow a nonzero value to have some special meaning in the future. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index cba0b87c34e4..ba7f43a4459e 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2169,6 +2169,9 @@ static long kvm_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); break; case KVM_GET_REGS: { @@ -2440,9 +2443,15 @@ static long kvm_dev_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; r = KVM_API_VERSION; break; case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; r = kvm_dev_ioctl_create_vm(); break; case KVM_GET_MSR_INDEX_LIST: { From 039576c03c35e2f990ad9bb9c39e1bad3cd60d34 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Mar 2007 12:46:50 +0200 Subject: [PATCH 20/66] KVM: Avoid guest virtual addresses in string pio userspace interface The current string pio interface communicates using guest virtual addresses, relying on userspace to translate addresses and to check permissions. This interface cannot fully support guest smp, as the check needs to take into account two pages at one in case an unaligned string transfer straddles a page boundary. Change the interface not to communicate guest addresses at all; instead use a buffer page (mmaped by userspace) and do transfers there. The kernel manages the virtual to physical translation and can perform the checks atomically by taking the appropriate locks. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 21 ++++- drivers/kvm/kvm_main.c | 183 +++++++++++++++++++++++++++++++++++++---- drivers/kvm/mmu.c | 9 ++ drivers/kvm/svm.c | 40 +++++---- drivers/kvm/vmx.c | 40 ++++----- include/linux/kvm.h | 11 +-- 6 files changed, 238 insertions(+), 66 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 1c4a581938bf..7866b34b6c96 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -74,6 +74,8 @@ #define IOPL_SHIFT 12 +#define KVM_PIO_PAGE_OFFSET 1 + /* * Address types: * @@ -220,6 +222,18 @@ enum { VCPU_SREG_LDTR, }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int size; + int string; + int down; + int rep; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -275,7 +289,8 @@ struct kvm_vcpu { int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; - int pio_pending; + struct kvm_pio_request pio; + void *pio_data; int sigset_active; sigset_t sigset; @@ -421,6 +436,7 @@ hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); void kvm_emulator_want_group7_invlpg(void); @@ -453,6 +469,9 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, struct x86_emulate_ctxt; +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ba7f43a4459e..205998c141fb 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -346,6 +346,17 @@ static void kvm_free_physmem(struct kvm *kvm) kvm_free_physmem_slot(&kvm->memslots[i], NULL); } +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < 2; ++i) + if (vcpu->pio.guest_pages[i]) { + __free_page(vcpu->pio.guest_pages[i]); + vcpu->pio.guest_pages[i] = NULL; + } +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -357,6 +368,9 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu) kvm_arch_ops->vcpu_free(vcpu); free_page((unsigned long)vcpu->run); vcpu->run = NULL; + free_page((unsigned long)vcpu->pio_data); + vcpu->pio_data = NULL; + free_pio_guest_pages(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -1550,44 +1564,168 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); -static void complete_pio(struct kvm_vcpu *vcpu) +static int pio_copy_data(struct kvm_vcpu *vcpu) { - struct kvm_io *io = &vcpu->run->io; + void *p = vcpu->pio_data; + void *q; + unsigned bytes; + int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; + + kvm_arch_ops->vcpu_put(vcpu); + q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + PAGE_KERNEL); + if (!q) { + kvm_arch_ops->vcpu_load(vcpu); + free_pio_guest_pages(vcpu); + return -ENOMEM; + } + q += vcpu->pio.guest_page_offset; + bytes = vcpu->pio.size * vcpu->pio.cur_count; + if (vcpu->pio.in) + memcpy(q, p, bytes); + else + memcpy(p, q, bytes); + q -= vcpu->pio.guest_page_offset; + vunmap(q); + kvm_arch_ops->vcpu_load(vcpu); + free_pio_guest_pages(vcpu); + return 0; +} + +static int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->pio; long delta; + int r; kvm_arch_ops->cache_regs(vcpu); if (!io->string) { - if (io->direction == KVM_EXIT_IO_IN) - memcpy(&vcpu->regs[VCPU_REGS_RAX], &io->value, + if (io->in) + memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, io->size); } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) { + kvm_arch_ops->cache_regs(vcpu); + return r; + } + } + delta = 1; if (io->rep) { - delta *= io->count; + delta *= io->cur_count; /* * The size of the register should really depend on * current address size. */ vcpu->regs[VCPU_REGS_RCX] -= delta; } - if (io->string_down) + if (io->down) delta = -delta; delta *= io->size; - if (io->direction == KVM_EXIT_IO_IN) + if (io->in) vcpu->regs[VCPU_REGS_RDI] += delta; else vcpu->regs[VCPU_REGS_RSI] += delta; } - vcpu->pio_pending = 0; vcpu->run->io_completed = 0; kvm_arch_ops->decache_regs(vcpu); - kvm_arch_ops->skip_emulated_instruction(vcpu); + io->count -= io->cur_count; + io->cur_count = 0; + + if (!io->count) + kvm_arch_ops->skip_emulated_instruction(vcpu); + return 0; } +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port) +{ + unsigned now, in_page; + int i; + int nr_pages = 1; + struct page *page; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + vcpu->pio.count = count; + vcpu->pio.cur_count = count; + vcpu->pio.size = size; + vcpu->pio.in = in; + vcpu->pio.string = string; + vcpu->pio.down = down; + vcpu->pio.guest_page_offset = offset_in_page(address); + vcpu->pio.rep = rep; + + if (!string) { + kvm_arch_ops->cache_regs(vcpu); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); + kvm_arch_ops->decache_regs(vcpu); + return 0; + } + + if (!count) { + kvm_arch_ops->skip_emulated_instruction(vcpu); + return 1; + } + + now = min(count, PAGE_SIZE / size); + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) { + /* + * String I/O straddles page boundary. Pin two guest pages + * so that we satisfy atomicity constraints. Do just one + * transaction to avoid complexity. + */ + nr_pages = 2; + now = 1; + } + if (down) { + /* + * String I/O in reverse. Yuck. Kill the guest, fix later. + */ + printk(KERN_ERR "kvm: guest string pio down\n"); + inject_gp(vcpu); + return 1; + } + vcpu->run->io.count = now; + vcpu->pio.cur_count = now; + + for (i = 0; i < nr_pages; ++i) { + spin_lock(&vcpu->kvm->lock); + page = gva_to_page(vcpu, address + i * PAGE_SIZE); + if (page) + get_page(page); + vcpu->pio.guest_pages[i] = page; + spin_unlock(&vcpu->kvm->lock); + if (!page) { + inject_gp(vcpu); + free_pio_guest_pages(vcpu); + return 1; + } + } + + if (!vcpu->pio.in) + return pio_copy_data(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_setup_pio); + static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -1602,9 +1740,11 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->cr8 = kvm_run->cr8; if (kvm_run->io_completed) { - if (vcpu->pio_pending) - complete_pio(vcpu); - else { + if (vcpu->pio.cur_count) { + r = complete_pio(vcpu); + if (r) + goto out; + } else { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; } @@ -1620,6 +1760,7 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = kvm_arch_ops->run(vcpu, kvm_run); +out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1995,9 +2136,12 @@ static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (pgoff != 0) + if (pgoff == 0) + page = virt_to_page(vcpu->run); + else if (pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->pio_data); + else return NOPAGE_SIGBUS; - page = virt_to_page(vcpu->run); get_page(page); return page; } @@ -2094,6 +2238,12 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) goto out_unlock; vcpu->run = page_address(page); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_free_run; + vcpu->pio_data = page_address(page); + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, FX_IMAGE_ALIGN); vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; @@ -2123,6 +2273,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) out_free_vcpus: kvm_free_vcpu(vcpu); +out_free_run: + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; out_unlock: mutex_unlock(&vcpu->mutex); out: @@ -2491,7 +2644,7 @@ static long kvm_dev_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - r = PAGE_SIZE; + r = 2 * PAGE_SIZE; break; default: ; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2d905770fd88..4843e95e54e1 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -735,6 +735,15 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) return gpa_to_hpa(vcpu, gpa); } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); +} + static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 2396ada23777..64afc5cf890d 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -984,7 +984,7 @@ static int io_get_override(struct kvm_vcpu *vcpu, return 0; } -static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) +static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address) { unsigned long addr_mask; unsigned long *reg; @@ -1028,40 +1028,38 @@ static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? - int _in = io_info & SVM_IOIO_TYPE_MASK; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address = 0; ++kvm_stat.io_exits; vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; - kvm_run->exit_reason = KVM_EXIT_IO; - kvm_run->io.port = io_info >> 16; - kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); - kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; - kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; - kvm_run->io.count = 1; + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + port = io_info >> 16; + size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + string = (io_info & SVM_IOIO_STR_MASK) != 0; + rep = (io_info & SVM_IOIO_REP_MASK) != 0; + count = 1; + down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; - if (kvm_run->io.string) { + if (string) { unsigned addr_mask; - addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); + addr_mask = io_adress(vcpu, in, &address); if (!addr_mask) { printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); return 1; } - if (kvm_run->io.rep) { - kvm_run->io.count - = vcpu->regs[VCPU_REGS_RCX] & addr_mask; - kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags - & X86_EFLAGS_DF) != 0; - } - } else - kvm_run->io.value = vcpu->svm->vmcb->save.rax; - vcpu->pio_pending = 1; - return 0; + if (rep) + count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index e69bab6d811d..0d9bf0b36d37 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1394,7 +1394,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) { u64 inst; gva_t rip; @@ -1439,35 +1439,35 @@ static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) done: countr_size *= 8; *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); return 1; } static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address; ++kvm_stat.io_exits; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - kvm_run->exit_reason = KVM_EXIT_IO; - if (exit_qualification & 8) - kvm_run->io.direction = KVM_EXIT_IO_IN; - else - kvm_run->io.direction = KVM_EXIT_IO_OUT; - kvm_run->io.size = (exit_qualification & 7) + 1; - kvm_run->io.string = (exit_qualification & 16) != 0; - kvm_run->io.string_down - = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - kvm_run->io.rep = (exit_qualification & 32) != 0; - kvm_run->io.port = exit_qualification >> 16; - kvm_run->io.count = 1; - if (kvm_run->io.string) { - if (!get_io_count(vcpu, &kvm_run->io.count)) + in = (exit_qualification & 8) != 0; + size = (exit_qualification & 7) + 1; + string = (exit_qualification & 16) != 0; + down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + count = 1; + rep = (exit_qualification & 32) != 0; + port = exit_qualification >> 16; + address = 0; + if (string) { + if (rep && !get_io_count(vcpu, &count)) return 1; - kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); - } else - kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ - vcpu->pio_pending = 1; - return 0; + address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static void diff --git a/include/linux/kvm.h b/include/linux/kvm.h index dad90816cad8..728b24cf5d77 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -86,16 +86,9 @@ struct kvm_run { #define KVM_EXIT_IO_OUT 1 __u8 direction; __u8 size; /* bytes */ - __u8 string; - __u8 string_down; - __u8 rep; - __u8 pad; __u16 port; - __u64 count; - union { - __u64 address; - __u32 value; - }; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ } io; struct { } debug; From ca5aac1f96c18b5e4dcfea253d7ab607b5dcd5c9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Mar 2007 14:29:06 +0200 Subject: [PATCH 21/66] KVM: MMU: Remove unnecessary check for pdptr access We already special case the pdptr access, so no need to check it again. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index f3bcee904651..17bd4400c92b 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -148,8 +148,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, break; } - if (walker->level != 3 || is_long_mode(vcpu)) - walker->inherited_ar &= walker->table[index]; + walker->inherited_ar &= walker->table[index]; table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); kunmap_atomic(walker->table, KM_USER0); From aac012245a59d78372dc66d292ba567367d86b60 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Mar 2007 14:34:28 +0200 Subject: [PATCH 22/66] KVM: MMU: Remove global pte tracking The initial, noncaching, version of the kvm mmu flushed the all nonglobal shadow page table translations (much like a native tlb flush). The new implementation flushes translations only when they change, rendering global pte tracking superfluous. This removes the unused tracking mechanism and storage space. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 - drivers/kvm/mmu.c | 9 --------- 2 files changed, 10 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 7866b34b6c96..a4331da816d0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -136,7 +136,6 @@ struct kvm_mmu_page { unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ - int global; /* Set if all ptes in this page are global */ int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ union { diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 4843e95e54e1..2930d7cc7c06 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -461,7 +461,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, list_add(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->page_hpa)); page->slot_bitmap = 0; - page->global = 1; page->multimapped = 0; page->parent_pte = parent_pte; --vcpu->kvm->n_free_mmu_pages; @@ -927,11 +926,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } -static void mark_pagetable_nonglobal(void *shadow_pte) -{ - page_header(__pa(shadow_pte))->global = 0; -} - static inline void set_pte_common(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, @@ -949,9 +943,6 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, *shadow_pte |= access_bits; - if (!(*shadow_pte & PT_GLOBAL_MASK)) - mark_pagetable_nonglobal(shadow_pte); - if (is_error_hpa(paddr)) { *shadow_pte |= gaddr; *shadow_pte |= PT_SHADOW_IO_MARK; From 8cb5b0333250beb382624f626851a31f601b4830 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Mar 2007 18:40:40 +0200 Subject: [PATCH 23/66] KVM: Workaround vmx inability to virtualize the reset state The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000, which aren't compatible with vm86 mode, which is used for real mode virtualization. When we create a vcpu, we set cs.base to 0xf0000, but if we get there by way of a reset, the values are inconsistent and vmx refuses to enter guest mode. Workaround by detecting the state and munging it appropriately. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 0d9bf0b36d37..aa7e2ba6fb5a 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -712,6 +712,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); From f6528b03f167785301908bf124db7be591e983ca Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Mar 2007 18:44:51 +0200 Subject: [PATCH 24/66] KVM: Remove set_cr0_no_modeswitch() arch op set_cr0_no_modeswitch() was a hack to avoid corrupting segment registers. As we now cache the protected mode values on entry to real mode, this isn't an issue anymore, and it interferes with reboot (which usually _is_ a modeswitch). Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 -- drivers/kvm/kvm_main.c | 2 +- drivers/kvm/svm.c | 1 - drivers/kvm/vmx.c | 17 ----------------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index a4331da816d0..7361c45d70c9 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -383,8 +383,6 @@ struct kvm_arch_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, - unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 205998c141fb..b998bc68e9f2 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1936,7 +1936,7 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; - kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); + kvm_arch_ops->set_cr0(vcpu, sregs->cr0); mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_arch_ops->set_cr4(vcpu, sregs->cr4); diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 64afc5cf890d..d3cc1157d23b 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1716,7 +1716,6 @@ static struct kvm_arch_ops svm_arch_ops = { .get_cs_db_l_bits = svm_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, .set_cr0 = svm_set_cr0, - .set_cr0_no_modeswitch = svm_set_cr0, .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index aa7e2ba6fb5a..027a9625ef90 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -788,22 +788,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->cr0 = cr0; } -/* - * Used when restoring the VM to avoid corrupting segment registers - */ -static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) - enter_rmode(vcpu); - - vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); - update_exception_bitmap(vcpu); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->cr0 = cr0; -} - static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); @@ -2069,7 +2053,6 @@ static struct kvm_arch_ops vmx_arch_ops = { .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, .set_cr0 = vmx_set_cr0, - .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, #ifdef CONFIG_X86_64 From 024aa1c02f0a9f938af83f55c727bcb18187eba4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 Mar 2007 13:44:58 +0200 Subject: [PATCH 25/66] KVM: Modify guest segments after potentially switching modes The SET_SREGS ioctl modifies both cr0.pe (real mode/protected mode) and guest segment registers. Since segment handling is modified by the mode on Intel procesors, update the segment registers after the mode switch has taken place. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index b998bc68e9f2..05235e1751fb 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1904,16 +1904,6 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - dt.limit = sregs->idt.limit; dt.base = sregs->idt.base; kvm_arch_ops->set_idt(vcpu, &dt); @@ -1953,6 +1943,16 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (vcpu->irq_pending[i]) __set_bit(i, &vcpu->irq_summary); + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + vcpu_put(vcpu); return 0; From 038881c8bec0e9a796d1782c56e29e7c2456626d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 Mar 2007 17:58:32 +0200 Subject: [PATCH 26/66] KVM: Hack real-mode segments on vmx from KVM_SET_SREGS As usual, we need to mangle segment registers when emulating real mode as vm86 has specific constraints. We special case the reset segment base, and set the "access rights" (or descriptor flags) to vm86 comaptible values. This fixes reboot on vmx. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 027a9625ef90..578dff5424e3 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -864,7 +864,14 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (var->unusable) + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; From 6da63cf95f6a19fe0a302232048452c96b178e45 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 Mar 2007 18:11:36 +0200 Subject: [PATCH 27/66] KVM: Don't allow the guest to turn off the cpu cache The cpu cache is a host resource; the guest should not be able to turn it off (even for itself). Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index d3cc1157d23b..191bc45b83e0 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -737,8 +737,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif vcpu->svm->cr0 = cr0; - vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK | CR0_WP_MASK; vcpu->cr0 = cr0; + cr0 |= CR0_PG_MASK | CR0_WP_MASK; + cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); + vcpu->svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) From fcd3410870049cb74bb1a3a2458cb3ec21185cd1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 Mar 2007 18:14:42 +0200 Subject: [PATCH 28/66] KVM: Remove unused and write-only variables Trivial cleanup. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_svm.h | 2 -- drivers/kvm/svm.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index 624f1ca48657..a1a9eba50d47 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -28,8 +28,6 @@ struct vcpu_svm { struct svm_cpu_data *svm_data; uint64_t asid_generation; - unsigned long cr0; - unsigned long cr4; unsigned long db_regs[NUM_DB_REGS]; u64 next_rip; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 191bc45b83e0..ddc0505c3374 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -576,7 +576,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) vcpu->svm->vmcb = page_address(page); memset(vcpu->svm->vmcb, 0, PAGE_SIZE); vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - vcpu->svm->cr0 = 0x00000010; vcpu->svm->asid_generation = 0; memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); init_vmcb(vcpu->svm->vmcb); @@ -736,7 +735,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif - vcpu->svm->cr0 = cr0; vcpu->cr0 = cr0; cr0 |= CR0_PG_MASK | CR0_WP_MASK; cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); From 0e5bf0d0e449f6597870570e8dd17e78ba4d75ff Mon Sep 17 00:00:00 2001 From: Sergey Kiselev Date: Thu, 22 Mar 2007 14:06:18 +0200 Subject: [PATCH 29/66] KVM: Handle writes to MCG_STATUS msr Some older (~2.6.7) kernels write MCG_STATUS register during kernel boot (mce_clear_all() function, called from mce_init()). It's not currently handled by kvm and will cause it to inject a GPF. Following patch adds a "nop" handler for this. Signed-off-by: Sergey Kiselev Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 05235e1751fb..33eade7e237c 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1467,6 +1467,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data); break; + case MSR_IA32_MCG_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case 0x200 ... 0x2ff: /* MTRRs */ From 916ce2360fadc71d924e02403b31280112a31280 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 21 Mar 2007 19:47:00 +0100 Subject: [PATCH 30/66] KVM: SVM: forbid guest to execute monitor/mwait This patch forbids the guest to execute monitor/mwait instructions on SVM. This is necessary because the guest can execute these instructions if they are available even if the kvm cpuid doesn't report its existence. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 6 +++++- drivers/kvm/svm.h | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index ddc0505c3374..0542d3357ce1 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -511,7 +511,9 @@ static void init_vmcb(struct vmcb *vmcb) (1ULL << INTERCEPT_VMSAVE) | (1ULL << INTERCEPT_STGI) | (1ULL << INTERCEPT_CLGI) | - (1ULL << INTERCEPT_SKINIT); + (1ULL << INTERCEPT_SKINIT) | + (1ULL << INTERCEPT_MONITOR) | + (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; control->msrpm_base_pa = msrpm_base; @@ -1292,6 +1294,8 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, [SVM_EXIT_STGI] = invalid_op_interception, [SVM_EXIT_CLGI] = invalid_op_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = invalid_op_interception, + [SVM_EXIT_MWAIT] = invalid_op_interception, }; diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h index df731c3fb588..5e93814400ce 100644 --- a/drivers/kvm/svm.h +++ b/drivers/kvm/svm.h @@ -44,6 +44,9 @@ enum { INTERCEPT_RDTSCP, INTERCEPT_ICEBP, INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, }; @@ -298,6 +301,9 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_RDTSCP 0x087 #define SVM_EXIT_ICEBP 0x088 #define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_ERR -1 From d28c6cfbbc5e2d4fccfe6d733995ed5971ca87f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 23 Mar 2007 09:55:25 +0200 Subject: [PATCH 31/66] KVM: MMU: Fix hugepage pdes mapping same physical address with different access The kvm mmu keeps a shadow page for hugepage pdes; if several such pdes map the same physical address, they share the same shadow page. This is a fairly common case (kernel mappings on i386 nonpae Linux, for example). However, if the two pdes map the same memory but with different permissions, kvm will happily use the cached shadow page. If the access through the more permissive pde will occur after the access to the strict pde, an endless pagefault loop will be generated and the guest will make no progress. Fix by making the access permissions part of the cache lookup key. The fix allows Xen pae to boot on kvm and run guest domains. Thanks to Jeremy Fitzhardinge for reporting the bug and testing the fix. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 ++ drivers/kvm/mmu.c | 8 +++++--- drivers/kvm/paging_tmpl.h | 7 ++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 7361c45d70c9..f5e343cb06b0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -109,6 +109,7 @@ struct kvm_pte_chain { * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + * bits 17:18 - "access" - the user and writable bits of a huge page pde */ union kvm_mmu_page_role { unsigned word; @@ -118,6 +119,7 @@ union kvm_mmu_page_role { unsigned quadrant : 2; unsigned pad_for_nice_hex_output : 6; unsigned metaphysical : 1; + unsigned hugepage_access : 2; }; }; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2930d7cc7c06..c738fb1cea30 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -568,6 +568,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int metaphysical, + unsigned hugepage_access, u64 *parent_pte) { union kvm_mmu_page_role role; @@ -581,6 +582,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.glevels = vcpu->mmu.root_level; role.level = level; role.metaphysical = metaphysical; + role.hugepage_access = hugepage_access; if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; @@ -780,7 +782,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) >> PAGE_SHIFT; new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, v, level - 1, - 1, &table[index]); + 1, 0, &table[index]); if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); return -ENOMEM; @@ -835,7 +837,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, NULL); + PT64_ROOT_LEVEL, 0, 0, NULL); root = page->page_hpa; ++page->root_count; vcpu->mmu.root_hpa = root; @@ -852,7 +854,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), - NULL); + 0, NULL); root = page->page_hpa; ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 17bd4400c92b..b94010dad809 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -247,6 +247,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, u64 shadow_pte; int metaphysical; gfn_t table_gfn; + unsigned hugepage_access = 0; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) @@ -276,6 +277,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; + hugepage_access = *guest_ent; + hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; + hugepage_access >>= PT_WRITABLE_SHIFT; table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; } else { @@ -283,7 +287,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, table_gfn = walker->table_gfn[level - 2]; } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, shadow_ent); + metaphysical, hugepage_access, + shadow_ent); shadow_addr = shadow_page->page_hpa; shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; From 0cc5064d335543a72c5ef904a3f528966fa3f2d2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 25 Mar 2007 12:07:27 +0200 Subject: [PATCH 32/66] KVM: SVM: Ensure timestamp counter monotonicity When a vcpu is migrated from one cpu to another, its timestamp counter may lose its monotonic property if the host has unsynced timestamp counters. This can confuse the guest, sometimes to the point of refusing to boot. As the rdtsc instruction is rather fast on AMD processors (7-10 cycles), we can simply record the last host tsc when we drop the cpu, and adjust the vcpu tsc offset when we detect that we've migrated to a different cpu. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/svm.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index f5e343cb06b0..6d0bd7aab92e 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -244,6 +244,7 @@ struct kvm_vcpu { struct mutex mutex; int cpu; int launched; + u64 host_tsc; struct kvm_run *run; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 0542d3357ce1..ca2642fc091f 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -459,7 +459,6 @@ static void init_vmcb(struct vmcb *vmcb) { struct vmcb_control_area *control = &vmcb->control; struct vmcb_save_area *save = &vmcb->save; - u64 tsc; control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | @@ -517,8 +516,7 @@ static void init_vmcb(struct vmcb *vmcb) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = msrpm_base; - rdtscll(tsc); - control->tsc_offset = -tsc; + control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -606,11 +604,26 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu) { - get_cpu(); + int cpu; + + cpu = get_cpu(); + if (unlikely(cpu != vcpu->cpu)) { + u64 tsc_this, delta; + + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + rdtscll(tsc_this); + delta = vcpu->host_tsc - tsc_this; + vcpu->svm->vmcb->control.tsc_offset += delta; + vcpu->cpu = cpu; + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { + rdtscll(vcpu->host_tsc); put_cpu(); } From 55bf4028342d96b21fe5dc0721b481b0bc1e81f6 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Sun, 25 Mar 2007 17:59:32 +0200 Subject: [PATCH 33/66] KVM: Remove unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unused function CC drivers/kvm/svm.o drivers/kvm/svm.c:207: warning: ‘inject_db’ defined but not used Signed-off-by: Michal Piotrowski Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index ca2642fc091f..303e95957e7b 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -203,13 +203,6 @@ static void inject_ud(struct kvm_vcpu *vcpu) UD_VECTOR; } -static void inject_db(struct kvm_vcpu *vcpu) -{ - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_TYPE_EXEPT | - DB_VECTOR; -} - static int is_page_fault(uint32_t info) { info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; From 36868f7b0efd0b6a1d45fe3b40a6c4bc63222659 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Mar 2007 19:31:52 +0200 Subject: [PATCH 34/66] KVM: Use list_move() Use list_move() where possible. Noticed by Dor Laor. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index c738fb1cea30..d81b9cd3465f 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -437,9 +437,8 @@ static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) struct kvm_mmu_page *page_head = page_header(page_hpa); ASSERT(is_empty_shadow_page(page_hpa)); - list_del(&page_head->link); page_head->page_hpa = page_hpa; - list_add(&page_head->link, &vcpu->free_pages); + list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -457,8 +456,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, return NULL; page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); - list_del(&page->link); - list_add(&page->link, &vcpu->kvm->active_mmu_pages); + list_move(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->page_hpa)); page->slot_bitmap = 0; page->multimapped = 0; @@ -670,10 +668,8 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, if (!page->root_count) { hlist_del(&page->hash_link); kvm_mmu_free_page(vcpu, page->page_hpa); - } else { - list_del(&page->link); - list_add(&page->link, &vcpu->kvm->active_mmu_pages); - } + } else + list_move(&page->link, &vcpu->kvm->active_mmu_pages); } static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) From afeb1f14c5478560262b37431726eb0eb1a42e9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 27 Mar 2007 17:50:20 +0200 Subject: [PATCH 35/66] KVM: Remove debug message No longer interesting. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 578dff5424e3..b64b7b792e84 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1131,7 +1131,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->guest_msrs[j] = vcpu->host_msrs[j]; ++vcpu->nmsrs; } - printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, From df513e2cdd099822ed32cbc20aaf4ff310372202 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 28 Mar 2007 20:04:16 +0200 Subject: [PATCH 36/66] KVM: x86 emulator: fix bit string operations operand size On x86, bit operations operate on a string of bits that can reside in multiple words. For example, 'btsl %eax, (blah)' will touch the word at blah+4 if %eax is between 32 and 63. The x86 emulator compensates for that by advancing the operand address by (bit offset / BITS_PER_LONG) and truncating the bit offset to the range (0..BITS_PER_LONG-1). This has a side effect of forcing the operand size to 8 bytes on 64-bit hosts. Now, a 32-bit guest goes and fork()s a process. It write protects a stack page at 0xbffff000 using the 'btr' instruction, at offset 0xffc in the page table, with bit offset 1 (for the write permission bit). The emulator now forces the operand size to 8 bytes as previously described, and an innocent page table update turns into a cross-page-boundary write, which is assumed by the mmu code not to be a page table, so it doesn't actually clear the corresponding shadow page table entry. The guest and host permissions are out of sync and guest memory is corrupted soon afterwards, leading to guest failure. Fix by not using BITS_PER_LONG as the word size; instead use the actual operand size, so we get a 32-bit write in that case. Note we still have to teach the mmu to handle cross-page-boundary writes to guest page table; but for now this allows Damn Small Linux 0.4 (2.4.20) to boot. Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 7513cddb929f..bcf872bdaf74 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -833,8 +833,9 @@ done_prefixes: dst.ptr = (unsigned long *)cr2; dst.bytes = (d & ByteOp) ? 1 : op_bytes; if (d & BitOp) { - dst.ptr += src.val / BITS_PER_LONG; - dst.bytes = sizeof(long); + unsigned long mask = ~(dst.bytes * 8 - 1); + + dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; } if (!(d & Mov) && /* optimisation - avoid slow emulated read */ ((rc = ops->read_emulated((unsigned long)dst.ptr, From e0fa826f969c262c23908953bf85add487cc2e6c Mon Sep 17 00:00:00 2001 From: Dor Laor Date: Fri, 30 Mar 2007 13:06:33 +0300 Subject: [PATCH 37/66] KVM: Add mmu cache clear function Functions that play around with the physical memory map need a way to clear mappings to possibly nonexistent or invalid memory. Both the mmu cache and the processor tlb are cleared. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/mmu.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 6d0bd7aab92e..59357bea5b61 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -430,6 +430,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); +void kvm_mmu_zap_all(struct kvm_vcpu *vcpu); hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index d81b9cd3465f..376800a33968 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1314,6 +1314,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) } } +void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + struct kvm_mmu_page *page; + + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } + + mmu_free_memory_caches(vcpu); + kvm_arch_ops->tlb_flush(vcpu); + init_kvm_mmu(vcpu); +} + #ifdef AUDIT static const char *audit_msg; From 954bbbc236afe23b368abdf4942f313a5f6e1d50 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 30 Mar 2007 14:02:32 +0300 Subject: [PATCH 38/66] KVM: Simply gfn_to_page() Mapping a guest page to a host page is a common operation. Currently, one has first to find the memory slot where the page belongs (gfn_to_memslot), then locate the page itself (gfn_to_page()). This is clumsy, and also won't work well with memory aliases. So simplify gfn_to_page() not to require memory slot translation first, and instead do it internally. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 12 +---------- drivers/kvm/kvm_main.c | 45 +++++++++++++++++++++++------------------- drivers/kvm/mmu.c | 12 ++++------- drivers/kvm/vmx.c | 6 +++--- 4 files changed, 33 insertions(+), 42 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 59357bea5b61..d19985a5508a 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -443,11 +443,7 @@ void kvm_emulator_want_group7_invlpg(void); extern hpa_t bad_page_address; -static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return slot->phys_mem[gfn - slot->base_gfn]; -} - +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); @@ -523,12 +519,6 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, return vcpu->mmu.page_fault(vcpu, gva, error_code); } -static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL; -} - static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 33eade7e237c..a0ec5dda3305 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -420,12 +420,12 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) u64 pdpte; u64 *pdpt; int ret; - struct kvm_memory_slot *memslot; + struct page *page; spin_lock(&vcpu->kvm->lock); - memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); - /* FIXME: !memslot - emulate? 0xff? */ - pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); + page = gfn_to_page(vcpu->kvm, pdpt_gfn); + /* FIXME: !page - emulate? 0xff? */ + pdpt = kmap_atomic(page, KM_USER0); ret = 1; for (i = 0; i < 4; ++i) { @@ -861,6 +861,17 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_memslot); +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) + return NULL; + return slot->phys_mem[gfn - slot->base_gfn]; +} +EXPORT_SYMBOL_GPL(gfn_to_page); + void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { int i; @@ -899,20 +910,20 @@ static int emulator_read_std(unsigned long addr, unsigned offset = addr & (PAGE_SIZE-1); unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); unsigned long pfn; - struct kvm_memory_slot *memslot; - void *page; + struct page *page; + void *page_virt; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; pfn = gpa >> PAGE_SHIFT; - memslot = gfn_to_memslot(vcpu->kvm, pfn); - if (!memslot) + page = gfn_to_page(vcpu->kvm, pfn); + if (!page) return X86EMUL_UNHANDLEABLE; - page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); + page_virt = kmap_atomic(page, KM_USER0); - memcpy(data, page + offset, tocopy); + memcpy(data, page_virt + offset, tocopy); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(page_virt, KM_USER0); bytes -= tocopy; data += tocopy; @@ -963,16 +974,14 @@ static int emulator_read_emulated(unsigned long addr, static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long val, int bytes) { - struct kvm_memory_slot *m; struct page *page; void *virt; if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; - m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!m) + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) return 0; - page = gfn_to_page(m, gpa >> PAGE_SHIFT); kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); @@ -2516,15 +2525,11 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma, { struct kvm *kvm = vma->vm_file->private_data; unsigned long pgoff; - struct kvm_memory_slot *slot; struct page *page; *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - slot = gfn_to_memslot(kvm, pgoff); - if (!slot) - return NOPAGE_SIGBUS; - page = gfn_to_page(slot, pgoff); + page = gfn_to_page(kvm, pgoff); if (!page) return NOPAGE_SIGBUS; get_page(page); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 376800a33968..8bdb9ca1811c 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -390,13 +390,11 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm *kvm = vcpu->kvm; struct page *page; - struct kvm_memory_slot *slot; struct kvm_rmap_desc *desc; u64 *spte; - slot = gfn_to_memslot(kvm, gfn); - BUG_ON(!slot); - page = gfn_to_page(slot, gfn); + page = gfn_to_page(kvm, gfn); + BUG_ON(!page); while (page_private(page)) { if (!(page_private(page) & 1)) @@ -711,14 +709,12 @@ hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - struct kvm_memory_slot *slot; struct page *page; ASSERT((gpa & HPA_ERR_MASK) == 0); - slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!slot) + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) return gpa | HPA_ERR_MASK; - page = gfn_to_page(slot, gpa >> PAGE_SHIFT); return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | (gpa & (PAGE_SIZE-1)); } diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b64b7b792e84..61a611691e50 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -926,9 +926,9 @@ static int init_rmode_tss(struct kvm* kvm) gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; char *page; - p1 = _gfn_to_page(kvm, fn++); - p2 = _gfn_to_page(kvm, fn++); - p3 = _gfn_to_page(kvm, fn); + p1 = gfn_to_page(kvm, fn++); + p2 = gfn_to_page(kvm, fn++); + p3 = gfn_to_page(kvm, fn); if (!p1 || !p2 || !p3) { kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); From e8207547d2f7b2f557bdb73015c1f74c32474438 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 30 Mar 2007 16:54:30 +0300 Subject: [PATCH 39/66] KVM: Add physical memory aliasing feature With this, we can specify that accesses to one physical memory range will be remapped to another. This is useful for the vga window at 0xa0000 which is used as a movable window into the (much larger) framebuffer. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 9 +++++ drivers/kvm/kvm_main.c | 89 ++++++++++++++++++++++++++++++++++++++++-- include/linux/kvm.h | 10 ++++- 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index d19985a5508a..fceeb840a255 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -51,6 +51,7 @@ #define UNMAPPED_GVA (~(gpa_t)0) #define KVM_MAX_VCPUS 1 +#define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 #define KVM_NUM_MMU_PAGES 256 #define KVM_MIN_FREE_MMU_PAGES 5 @@ -312,6 +313,12 @@ struct kvm_vcpu { struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; }; +struct kvm_mem_alias { + gfn_t base_gfn; + unsigned long npages; + gfn_t target_gfn; +}; + struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -322,6 +329,8 @@ struct kvm_memory_slot { struct kvm { spinlock_t lock; /* protects everything except vcpus */ + int naliases; + struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; /* diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index a0ec5dda3305..e495855727e2 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -846,7 +846,73 @@ out: return r; } -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +/* + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. + */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + spin_lock(&kvm->lock); + + p = &kvm->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->aliases[n - 1].npages) + break; + kvm->naliases = n; + + spin_unlock(&kvm->lock); + + vcpu_load(&kvm->vcpus[0]); + spin_lock(&kvm->lock); + kvm_mmu_zap_all(&kvm->vcpus[0]); + spin_unlock(&kvm->lock); + vcpu_put(&kvm->vcpus[0]); + + return 0; + +out: + return r; +} + +static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i = 0; i < kvm->naliases; ++i) { + alias = &kvm->aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; @@ -859,13 +925,19 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) } return NULL; } -EXPORT_SYMBOL_GPL(gfn_to_memslot); + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); +} struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); if (!slot) return NULL; return slot->phys_mem[gfn - slot->base_gfn]; @@ -2512,6 +2584,17 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; + + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 728b24cf5d77..da9b23fa4b6c 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 9 +#define KVM_API_VERSION 10 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -33,6 +33,13 @@ struct kvm_memory_region { /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, @@ -261,6 +268,7 @@ struct kvm_signal_mask { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) /* * ioctls for vcpu fds From b8836737d92c139be770eae3d6574e33d1224caf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Apr 2007 16:34:31 +0300 Subject: [PATCH 40/66] KVM: Add fpu get/set operations These are really helpful when migrating an floating point app to another machine. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 86 ++++++++++++++++++++++++++++++++++++++++++ include/linux/kvm.h | 17 +++++++++ 2 files changed, 103 insertions(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index e495855727e2..b065c4991568 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2398,6 +2398,67 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } +/* + * fxsave fpu state. Taken from x86_64/processor.h. To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2542,6 +2603,31 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); break; } + case KVM_GET_FPU: { + struct kvm_fpu fpu; + + memset(&fpu, 0, sizeof fpu); + r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &fpu, sizeof fpu)) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu fpu; + + r = -EFAULT; + if (copy_from_user(&fpu, argp, sizeof fpu)) + goto out; + r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); + if (r) + goto out; + r = 0; + break; + } default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index da9b23fa4b6c..07bf353eeb6f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -126,6 +126,21 @@ struct kvm_regs { __u64 rip, rflags; }; +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + struct kvm_segment { __u64 base; __u32 limit; @@ -285,5 +300,7 @@ struct kvm_signal_mask { #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) #define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) #endif From 80b7706e4cbaa51d65bd6fea83bd0e59856f50e9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2007 17:02:14 +0300 Subject: [PATCH 41/66] KVM: SVM: enable LBRV virtualization if available This patch enables the virtualization of the last branch record MSRs on SVM if this feature is available in hardware. It also introduces a small and simple check feature for specific SVM extensions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 303e95957e7b..b7e1410e4007 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -44,6 +44,10 @@ MODULE_LICENSE("GPL"); #define KVM_EFER_LMA (1 << 10) #define KVM_EFER_LME (1 << 8) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_DEATURE_SVML (1 << 2) + unsigned long iopm_base; unsigned long msrpm_base; @@ -68,6 +72,7 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +static uint32_t svm_features; struct svm_init_data { int cpu; @@ -82,6 +87,11 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define MAX_INST_SIZE 15 +static inline u32 svm_has(u32 feat) +{ + return svm_features & feat; +} + static unsigned get_addr_size(struct kvm_vcpu *vcpu) { struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; @@ -302,6 +312,7 @@ static void svm_hardware_enable(void *garbage) svm_data->asid_generation = 1; svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; + svm_features = cpuid_edx(SVM_CPUID_FUNC); asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); gdt = (struct desc_struct *)gdt_descr.address; @@ -511,6 +522,8 @@ static void init_vmcb(struct vmcb *vmcb) control->msrpm_base_pa = msrpm_base; control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; + if (svm_has(SVM_FEATURE_LBRV)) + control->lbr_ctl = 1ULL; init_seg(&save->es); init_seg(&save->ss); From 5008fdf5b6a31240da060c0867d8f16f08ce2384 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 2 Apr 2007 13:05:50 +0300 Subject: [PATCH 42/66] KVM: Use kernel-standard types Noted by Joerg Roedel. Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index b7e1410e4007..c9b700d87801 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -63,9 +63,9 @@ struct kvm_ldttss_desc { struct svm_cpu_data { int cpu; - uint64_t asid_generation; - uint32_t max_asid; - uint32_t next_asid; + u64 asid_generation; + u32 max_asid; + u32 next_asid; struct kvm_ldttss_desc *tss_desc; struct page *save_area; From 3964994bb5ba85a3d8b54ae618f7be1cecce916d Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn / Snakebyte Date: Mon, 9 Apr 2007 16:15:05 +0200 Subject: [PATCH 43/66] KVM: Fix overflow bug in overflow detection code The expression sp - 6 < sp where sp is a u16 is undefined in C since 'sp - 6' is promoted to int, and signed overflow is undefined in C. gcc 4.2 actually warns about it. Replace with a simpler test. Signed-off-by: Eric Sesterhenn Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 61a611691e50..8c0115b54802 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1182,7 +1182,7 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) u16 sp = vmcs_readl(GUEST_RSP); u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); - if (sp > ss_limit || sp - 6 > sp) { + if (sp > ss_limit || sp < 6 ) { vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", __FUNCTION__, vmcs_readl(GUEST_RSP), From d917a6b92d0d1e4e2b98e86c584bc9e643cd5117 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 12 Apr 2007 13:03:01 +0300 Subject: [PATCH 44/66] KVM: Initialize cr0 to indicate an fpu is present Solaris panics if it sees a cpu with no fpu, and it seems to rely on this bit. Closes sourceforge bug 1698920. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index b065c4991568..0b30631585bd 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2332,6 +2332,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, FX_IMAGE_ALIGN); vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; + vcpu->cr0 = 0x10; r = kvm_arch_ops->vcpu_create(vcpu); if (r < 0) From 417726a3fbecb2092f1054bbaee87bc442b05ef3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 12 Apr 2007 17:35:58 +0300 Subject: [PATCH 45/66] KVM: Handle partial pae pdptr Some guests (Solaris) do not set up all four pdptrs, but leave some invalid. kvm incorrectly treated these as valid page directories, pinning the wrong pages and causing general confusion. Fix by checking the valid bit of a pae pdpte. This closes sourceforge bug 1698922. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 8bdb9ca1811c..9ff74805c7d1 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -806,10 +806,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->mmu.pae_root[i]; - ASSERT(VALID_PAGE(root)); - root &= PT64_BASE_ADDR_MASK; - page = page_header(root); - --page->root_count; + if (root) { + ASSERT(VALID_PAGE(root)); + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + } vcpu->mmu.pae_root[i] = INVALID_PAGE; } vcpu->mmu.root_hpa = INVALID_PAGE; @@ -840,9 +842,13 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); - if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) + if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->pdptrs[i])) { + vcpu->mmu.pae_root[i] = 0; + continue; + } root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; - else if (vcpu->mmu.root_level == 0) + } else if (vcpu->mmu.root_level == 0) root_gfn = 0; page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), From b5a33a75720c03d58d8281a72b45ffd214f00ed7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Apr 2007 16:31:09 +0300 Subject: [PATCH 46/66] KVM: Use slab caches to allocate mmu data structures Better leak detection, statistics, memory use, speed -- goodness all around. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 3 +++ drivers/kvm/kvm_main.c | 7 +++++++ drivers/kvm/mmu.c | 39 +++++++++++++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index fceeb840a255..b9c318a9e334 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -433,6 +433,9 @@ extern struct kvm_arch_ops *kvm_arch_ops; int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); void kvm_exit_arch(void); +int kvm_mmu_module_init(void); +void kvm_mmu_module_exit(void); + void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 0b30631585bd..ab4dbd7fa5f8 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -3063,6 +3063,10 @@ static __init int kvm_init(void) static struct page *bad_page; int r; + r = kvm_mmu_module_init(); + if (r) + goto out4; + r = register_filesystem(&kvm_fs_type); if (r) goto out3; @@ -3091,6 +3095,8 @@ out: out2: unregister_filesystem(&kvm_fs_type); out3: + kvm_mmu_module_exit(); +out4: return r; } @@ -3100,6 +3106,7 @@ static __exit void kvm_exit(void) __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); mntput(kvmfs_mnt); unregister_filesystem(&kvm_fs_type); + kvm_mmu_module_exit(); } module_init(kvm_init) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 9ff74805c7d1..a368ea8297f3 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -159,6 +159,9 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; + static int is_write_protection(struct kvm_vcpu *vcpu) { return vcpu->cr0 & CR0_WP_MASK; @@ -196,14 +199,14 @@ static int is_rmap_pte(u64 pte) } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - size_t objsize, int min) + struct kmem_cache *base_cache, int min) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kzalloc(objsize, GFP_NOWAIT); + obj = kmem_cache_zalloc(base_cache, GFP_NOWAIT); if (!obj) return -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -222,11 +225,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) int r; r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain), 4); + pte_chain_cache, 4); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc), 1); + rmap_desc_cache, 1); out: return r; } @@ -1333,6 +1336,34 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL, NULL); + if (!rmap_desc_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + #ifdef AUDIT static const char *audit_msg; From 8c4385024d31cb909ad84a2cafa5c83a4c5fab61 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Apr 2007 11:53:17 +0300 Subject: [PATCH 47/66] KVM: Retry sleeping allocation if atomic allocation fails This avoids -ENOMEM under memory pressure. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a368ea8297f3..c814394a966d 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -199,14 +199,15 @@ static int is_rmap_pte(u64 pte) } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) + struct kmem_cache *base_cache, int min, + gfp_t gfp_flags) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_NOWAIT); + obj = kmem_cache_zalloc(base_cache, gfp_flags); if (!obj) return -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -220,20 +221,35 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) kfree(mc->objects[--mc->nobjs]); } -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) { int r; r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, - pte_chain_cache, 4); + pte_chain_cache, 4, gfp_flags); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, - rmap_desc_cache, 1); + rmap_desc_cache, 1, gfp_flags); out: return r; } +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT); + if (r < 0) { + spin_unlock(&vcpu->kvm->lock); + kvm_arch_ops->vcpu_put(vcpu); + r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL); + kvm_arch_ops->vcpu_load(vcpu); + spin_lock(&vcpu->kvm->lock); + } + return r; +} + static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); From 364b625b561b1dd74e6fa696949ae3de28999a66 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Apr 2007 14:28:40 +0300 Subject: [PATCH 48/66] KVM: SVM: Report hardware exit reason to userspace instead of dmesg Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index c9b700d87801..61ed7352e506 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1332,12 +1332,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || svm_exit_handlers[exit_code] == 0) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", - __FUNCTION__, - exit_code, - vcpu->svm->vmcb->save.rip, - vcpu->cr0, - vcpu->svm->vmcb->save.rflags); + kvm_run->hw.hardware_exit_reason = exit_code; return 0; } From c9047f533373e934b96d19d6a3d313ca2132fbe5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Apr 2007 10:53:22 +0300 Subject: [PATCH 49/66] KVM: Handle guest page faults when emulating mmio Usually, guest page faults are detected by the kvm page fault handler, which detects if they are shadow faults, mmio faults, pagetable faults, or normal guest page faults. However, in ceratin circumstances, we can detect a page fault much later. One of these events is the following combination: - A two memory operand instruction (e.g. movsb) is executed. - The first operand is in mmio space (which is the fault reported to kvm) - The second operand is in an ummaped address (e.g. a guest page fault) The Windows 2000 installer does such an access, an promptly hangs. Fix by adding the missing page fault injection on that path. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ab4dbd7fa5f8..03c0ee74d757 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1071,8 +1071,10 @@ static int emulator_write_emulated(unsigned long addr, struct kvm_vcpu *vcpu = ctxt->vcpu; gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - if (gpa == UNMAPPED_GVA) + if (gpa == UNMAPPED_GVA) { + kvm_arch_ops->inject_page_fault(vcpu, addr, 2); return X86EMUL_PROPAGATE_FAULT; + } if (emulator_write_phys(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; From 2345df8c555ecb92c0c36172c07d5ac321a92dc7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Apr 2007 15:30:24 +0300 Subject: [PATCH 50/66] KVM: VMX: Reduce unnecessary saving of host msrs THe automatically switched msrs are never changed on the host (with the exception of MSR_KERNEL_GS_BASE) and thus there is no need to save them on every vm entry. This reduces vmexit latency by ~400 cycles on i386 and by ~900 cycles (10%) on x86_64. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 8c0115b54802..3745e6ccc5b4 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -78,6 +78,10 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) +#ifdef CONFIG_X86_64 +static unsigned msr_offset_kernel_gs_base; +#endif + static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1129,6 +1133,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->host_msrs[j].reserved = 0; vcpu->host_msrs[j].data = data; vcpu->guest_msrs[j] = vcpu->host_msrs[j]; +#ifdef CONFIG_X86_64 + if (index == MSR_KERNEL_GS_BASE) + msr_offset_kernel_gs_base = j; +#endif ++vcpu->nmsrs; } @@ -1760,7 +1768,9 @@ again: fx_save(vcpu->host_fx_image); fx_restore(vcpu->guest_fx_image); - save_msrs(vcpu->host_msrs, vcpu->nmsrs); +#ifdef CONFIG_X86_64 + save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); +#endif load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); asm ( From e38aea3e9330624d19a233c05f3e69c57519edd5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Apr 2007 13:22:48 +0300 Subject: [PATCH 51/66] KVM: VMX: Don't switch 64-bit msrs for 32-bit guests Some msrs are only used by x86_64 instructions, and are therefore not needed when the guest is legacy mode. By not bothering to switch them, we reduce vmexit latency by 2400 cycles (from about 8800) when running a 32-bt guest on a 64-bit host. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 3745e6ccc5b4..6270df58e055 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -80,6 +80,9 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 static unsigned msr_offset_kernel_gs_base; +#define NR_64BIT_MSRS 4 +#else +#define NR_64BIT_MSRS 0 #endif static inline int is_page_fault(u32 intr_info) @@ -300,6 +303,32 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) INTR_INFO_VALID_MASK); } +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct kvm_vcpu *vcpu) +{ + int nr_skip, nr_good_msrs; + + if (is_long_mode(vcpu)) + nr_skip = NR_BAD_MSRS; + else + nr_skip = NR_64BIT_MSRS; + nr_good_msrs = vcpu->nmsrs - nr_skip; + + vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, + virt_to_phys(vcpu->guest_msrs + nr_skip)); + vmcs_writel(VM_EXIT_MSR_STORE_ADDR, + virt_to_phys(vcpu->guest_msrs + nr_skip)); + vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, + virt_to_phys(vcpu->host_msrs + nr_skip)); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ +} + /* * reads and returns guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset -- 21.3 @@ -825,6 +854,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } + setup_msrs(vcpu); } #endif @@ -988,7 +1018,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) struct descriptor_table dt; int i; int ret = 0; - int nr_good_msrs; extern asmlinkage void kvm_vmx_return(void); if (!init_rmode_tss(vcpu->kvm)) { @@ -1140,19 +1169,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ++vcpu->nmsrs; } - nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); - vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); - vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS)); + setup_msrs(vcpu); + vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - /* 22.2.1, 20.8.1 */ vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, @@ -1769,9 +1789,11 @@ again: fx_restore(vcpu->guest_fx_image); #ifdef CONFIG_X86_64 - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); + if (is_long_mode(vcpu)) { + save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + } #endif - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); asm ( /* Store host registers */ @@ -1915,8 +1937,12 @@ again: } ++kvm_stat.exits; - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); From 35cc7f971188366f5a5c0d5da1456bb38cef5da9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Apr 2007 13:26:39 +0300 Subject: [PATCH 52/66] KVM: Fold drivers/kvm/kvm_vmx.h into drivers/kvm/vmx.c No meat in that file. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_vmx.h | 14 -------------- drivers/kvm/vmx.c | 7 ++++++- 2 files changed, 6 insertions(+), 15 deletions(-) delete mode 100644 drivers/kvm/kvm_vmx.h diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h deleted file mode 100644 index d139f73fb6e1..000000000000 --- a/drivers/kvm/kvm_vmx.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __KVM_VMX_H -#define __KVM_VMX_H - -#ifdef CONFIG_X86_64 -/* - * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt - * mechanism (cpu bug AA24) - */ -#define NR_BAD_MSRS 2 -#else -#define NR_BAD_MSRS 0 -#endif - -#endif diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 6270df58e055..b61d4dd804e3 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -17,7 +17,6 @@ #include "kvm.h" #include "vmx.h" -#include "kvm_vmx.h" #include #include #include @@ -81,8 +80,14 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 static unsigned msr_offset_kernel_gs_base; #define NR_64BIT_MSRS 4 +/* + * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt + * mechanism (cpu bug AA24) + */ +#define NR_BAD_MSRS 2 #else #define NR_64BIT_MSRS 0 +#define NR_BAD_MSRS 0 #endif static inline int is_page_fault(u32 intr_info) From 4d56c8a787aefb2e3fc4ac4be966db96c14d1ad8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Apr 2007 14:28:44 +0300 Subject: [PATCH 53/66] KVM: VMX: Only save/restore MSR_K6_STAR if necessary Intel hosts only support syscall/sysret in long more (and only if efer.sce is enabled), so only reload the related MSR_K6_STAR if the guest will actually be able to use it. This reduces vmexit cost by about 500 cycles (6400 -> 5870) on my setup. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b61d4dd804e3..37537af126d1 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -69,6 +69,10 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, @@ -323,6 +327,18 @@ static void setup_msrs(struct kvm_vcpu *vcpu) nr_skip = NR_64BIT_MSRS; nr_good_msrs = vcpu->nmsrs - nr_skip; + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + if (find_msr_entry(vcpu, MSR_K6_STAR)) { + --nr_good_msrs; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) + ++nr_good_msrs; +#endif + } + vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, virt_to_phys(vcpu->guest_msrs + nr_skip)); vmcs_writel(VM_EXIT_MSR_STORE_ADDR, From d6c69ee9a24b307ce94e55ebfba6208a830c9ecb Mon Sep 17 00:00:00 2001 From: Yaozu Dong Date: Wed, 25 Apr 2007 14:17:25 +0800 Subject: [PATCH 54/66] KVM: MMU: Avoid heavy ASSERT at non debug mode. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index c814394a966d..8ccf84e3fda8 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -52,11 +52,15 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} static int dbg = 1; #endif +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else #define ASSERT(x) \ if (!(x)) { \ printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ __FILE__, __LINE__, #x); \ } +#endif #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -434,6 +438,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } } +#ifdef MMU_DEBUG static int is_empty_shadow_page(hpa_t page_hpa) { u64 *pos; @@ -448,6 +453,7 @@ static int is_empty_shadow_page(hpa_t page_hpa) } return 1; } +#endif static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) { From 3fca03653010b8c5fa63b99fc94c78cbfb433d00 Mon Sep 17 00:00:00 2001 From: Yaozu Dong Date: Wed, 25 Apr 2007 16:49:19 +0300 Subject: [PATCH 55/66] KVM: VMX: Avoid unnecessary vcpu_load()/vcpu_put() cycles By checking if a reschedule is needed, we avoid dropping the vcpu. [With changes by me, based on Anthony Liguori's observations] Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 03c0ee74d757..f5356358acff 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1590,6 +1590,8 @@ static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) void kvm_resched(struct kvm_vcpu *vcpu) { + if (!need_resched()) + return; vcpu_put(vcpu); cond_resched(); vcpu_load(vcpu); From 1165f5fec18c077bdba88e7125fd41f8e3617cb4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Apr 2007 17:27:43 +0300 Subject: [PATCH 56/66] KVM: Per-vcpu statistics Make the exit statistics per-vcpu instead of global. This gives a 3.5% boost when running one virtual machine per core on my two socket dual core (4 cores total) machine. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 35 +++++++++++----------- drivers/kvm/kvm_main.c | 61 +++++++++++++++++++++++++++------------ drivers/kvm/mmu.c | 2 +- drivers/kvm/paging_tmpl.h | 2 +- drivers/kvm/svm.c | 14 ++++----- drivers/kvm/vmx.c | 18 ++++++------ 6 files changed, 79 insertions(+), 53 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index b9c318a9e334..d1a90c5d76ce 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -236,6 +236,22 @@ struct kvm_pio_request { int rep; }; +struct kvm_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 halt_exits; + u32 request_irq_exits; + u32 irq_exits; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -298,6 +314,8 @@ struct kvm_vcpu { int sigset_active; sigset_t sigset; + struct kvm_stat stat; + struct { int active; u8 save_iopl; @@ -347,22 +365,6 @@ struct kvm { struct file *filp; }; -struct kvm_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 halt_exits; - u32 request_irq_exits; - u32 irq_exits; -}; - struct descriptor_table { u16 limit; unsigned long base; @@ -424,7 +426,6 @@ struct kvm_arch_ops { unsigned char *hypercall_addr); }; -extern struct kvm_stat kvm_stat; extern struct kvm_arch_ops *kvm_arch_ops; #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index f5356358acff..911c8175cc08 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -51,27 +51,27 @@ static DEFINE_SPINLOCK(kvm_lock); static LIST_HEAD(vm_list); struct kvm_arch_ops *kvm_arch_ops; -struct kvm_stat kvm_stat; -EXPORT_SYMBOL_GPL(kvm_stat); + +#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) static struct kvm_stats_debugfs_item { const char *name; - u32 *data; + int offset; struct dentry *dentry; } debugfs_entries[] = { - { "pf_fixed", &kvm_stat.pf_fixed }, - { "pf_guest", &kvm_stat.pf_guest }, - { "tlb_flush", &kvm_stat.tlb_flush }, - { "invlpg", &kvm_stat.invlpg }, - { "exits", &kvm_stat.exits }, - { "io_exits", &kvm_stat.io_exits }, - { "mmio_exits", &kvm_stat.mmio_exits }, - { "signal_exits", &kvm_stat.signal_exits }, - { "irq_window", &kvm_stat.irq_window_exits }, - { "halt_exits", &kvm_stat.halt_exits }, - { "request_irq", &kvm_stat.request_irq_exits }, - { "irq_exits", &kvm_stat.irq_exits }, - { NULL, NULL } + { "pf_fixed", STAT_OFFSET(pf_fixed) }, + { "pf_guest", STAT_OFFSET(pf_guest) }, + { "tlb_flush", STAT_OFFSET(tlb_flush) }, + { "invlpg", STAT_OFFSET(invlpg) }, + { "exits", STAT_OFFSET(exits) }, + { "io_exits", STAT_OFFSET(io_exits) }, + { "mmio_exits", STAT_OFFSET(mmio_exits) }, + { "signal_exits", STAT_OFFSET(signal_exits) }, + { "irq_window", STAT_OFFSET(irq_window_exits) }, + { "halt_exits", STAT_OFFSET(halt_exits) }, + { "request_irq", STAT_OFFSET(request_irq_exits) }, + { "irq_exits", STAT_OFFSET(irq_exits) }, + { NULL } }; static struct dentry *debugfs_dir; @@ -2930,14 +2930,39 @@ static struct notifier_block kvm_cpu_notifier = { .priority = 20, /* must be > scheduler priority */ }; +static u64 stat_get(void *_offset) +{ + unsigned offset = (long)_offset; + u64 total = 0; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = &kvm->vcpus[i]; + total += *(u32 *)((void *)vcpu + offset); + } + spin_unlock(&kvm_lock); + return total; +} + +static void stat_set(void *offset, u64 val) +{ +} + +DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n"); + static __init void kvm_init_debug(void) { struct kvm_stats_debugfs_item *p; debugfs_dir = debugfs_create_dir("kvm", NULL); for (p = debugfs_entries; p->name; ++p) - p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir, - p->data); + p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, + (void *)(long)p->offset, + &stat_fops); } static void kvm_exit_debug(void) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 8ccf84e3fda8..32c64f682081 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -936,7 +936,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - ++kvm_stat.tlb_flush; + ++vcpu->stat.tlb_flush; kvm_arch_ops->tlb_flush(vcpu); } diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index b94010dad809..73ffbffb1097 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -448,7 +448,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (is_io_pte(*shadow_pte)) return 1; - ++kvm_stat.pf_fixed; + ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); return write_pt; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 61ed7352e506..644efc5381ad 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -914,7 +914,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: - ++kvm_stat.mmio_exits; + ++vcpu->stat.mmio_exits; kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; case EMULATE_FAIL: @@ -1054,7 +1054,7 @@ static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long count; gva_t address = 0; - ++kvm_stat.io_exits; + ++vcpu->stat.io_exits; vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; @@ -1096,7 +1096,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; - ++kvm_stat.halt_exits; + ++vcpu->stat.halt_exits; return 0; } @@ -1264,7 +1264,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu, */ if (kvm_run->request_interrupt_window && !vcpu->irq_summary) { - ++kvm_stat.irq_window_exits; + ++vcpu->stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -1636,14 +1636,14 @@ again: r = handle_exit(vcpu, kvm_run); if (r > 0) { if (signal_pending(current)) { - ++kvm_stat.signal_exits; + ++vcpu->stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++kvm_stat.request_irq_exits; + ++vcpu->stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; @@ -1672,7 +1672,7 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu, { uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; - ++kvm_stat.pf_guest; + ++vcpu->stat.pf_guest; if (is_page_fault(exit_int_info)) { diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 37537af126d1..10845b7ff297 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1396,7 +1396,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: - ++kvm_stat.mmio_exits; + ++vcpu->stat.mmio_exits; kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; case EMULATE_FAIL: @@ -1425,7 +1425,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - ++kvm_stat.irq_exits; + ++vcpu->stat.irq_exits; return 1; } @@ -1492,7 +1492,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long count; gva_t address; - ++kvm_stat.io_exits; + ++vcpu->stat.io_exits; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); in = (exit_qualification & 8) != 0; size = (exit_qualification & 7) + 1; @@ -1682,7 +1682,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++kvm_stat.irq_window_exits; + ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -1695,7 +1695,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; - ++kvm_stat.halt_exits; + ++vcpu->stat.halt_exits; return 0; } @@ -1956,7 +1956,7 @@ again: reload_tss(); } - ++kvm_stat.exits; + ++vcpu->stat.exits; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1988,14 +1988,14 @@ again: if (r > 0) { /* Give scheduler a change to reschedule. */ if (signal_pending(current)) { - ++kvm_stat.signal_exits; + ++vcpu->stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++kvm_stat.request_irq_exits; + ++vcpu->stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; @@ -2021,7 +2021,7 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, { u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - ++kvm_stat.pf_guest; + ++vcpu->stat.pf_guest; if (is_page_fault(vect_info)) { printk(KERN_DEBUG "inject_page_fault: " From 4c690a1e8667a84b61a6114a4ad293681f32cb11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Apr 2007 15:28:19 +0300 Subject: [PATCH 57/66] KVM: Allow passing 64-bit values to the emulated read/write API This simplifies the API somewhat (by eliminating the special-case cmpxchg8b on i386). Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 45 ++++++++------------------------------ drivers/kvm/x86_emulate.c | 46 ++++++++------------------------------- drivers/kvm/x86_emulate.h | 32 +++++---------------------- 3 files changed, 24 insertions(+), 99 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 911c8175cc08..67554034d001 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -970,7 +970,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) } static int emulator_read_std(unsigned long addr, - unsigned long *val, + void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -1006,7 +1006,7 @@ static int emulator_read_std(unsigned long addr, } static int emulator_write_std(unsigned long addr, - unsigned long val, + const void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -1016,7 +1016,7 @@ static int emulator_write_std(unsigned long addr, } static int emulator_read_emulated(unsigned long addr, - unsigned long *val, + void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -1044,7 +1044,7 @@ static int emulator_read_emulated(unsigned long addr, } static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - unsigned long val, int bytes) + const void *val, int bytes) { struct page *page; void *virt; @@ -1057,14 +1057,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - memcpy(virt + offset_in_page(gpa), &val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); kvm_mmu_post_write(vcpu, gpa, bytes); return 1; } static int emulator_write_emulated(unsigned long addr, - unsigned long val, + const void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -1083,14 +1083,14 @@ static int emulator_write_emulated(unsigned long addr, vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, &val, bytes); + memcpy(vcpu->mmio_data, val, bytes); return X86EMUL_CONTINUE; } static int emulator_cmpxchg_emulated(unsigned long addr, - unsigned long old, - unsigned long new, + const void *old, + const void *new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -1103,30 +1103,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr, return emulator_write_emulated(addr, new, bytes, ctxt); } -#ifdef CONFIG_X86_32 - -static int emulator_cmpxchg8b_emulated(unsigned long addr, - unsigned long old_lo, - unsigned long old_hi, - unsigned long new_lo, - unsigned long new_hi, - struct x86_emulate_ctxt *ctxt) -{ - static int reported; - int r; - - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange8b as write\n"); - } - r = emulator_write_emulated(addr, new_lo, 4, ctxt); - if (r != X86EMUL_CONTINUE) - return r; - return emulator_write_emulated(addr+4, new_hi, 4, ctxt); -} - -#endif - static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_arch_ops->get_segment_base(vcpu, seg); @@ -1201,9 +1177,6 @@ struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, -#ifdef CONFIG_X86_32 - .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated, -#endif }; int emulate_instruction(struct kvm_vcpu *vcpu, diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index bcf872bdaf74..7ade09086aa5 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -1045,7 +1045,7 @@ done_prefixes: if ((rc = ops->write_std( register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), - dst.val, dst.bytes, ctxt)) != 0) + &dst.val, dst.bytes, ctxt)) != 0) goto done; dst.val = dst.orig_val; /* skanky: disable writeback */ break; @@ -1078,12 +1078,12 @@ writeback: case OP_MEM: if (lock_prefix) rc = ops->cmpxchg_emulated((unsigned long)dst. - ptr, dst.orig_val, - dst.val, dst.bytes, + ptr, &dst.orig_val, + &dst.val, dst.bytes, ctxt); else rc = ops->write_emulated((unsigned long)dst.ptr, - dst.val, dst.bytes, + &dst.val, dst.bytes, ctxt); if (rc != 0) goto done; @@ -1321,36 +1321,8 @@ twobyte_special_insn: realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); break; case 0xc7: /* Grp9 (cmpxchg8b) */ -#if defined(__i386__) { - unsigned long old_lo, old_hi; - if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4, - ctxt)) != 0) - || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4, - ctxt)) != 0)) - goto done; - if ((old_lo != _regs[VCPU_REGS_RAX]) - || (old_hi != _regs[VCPU_REGS_RDX])) { - _regs[VCPU_REGS_RAX] = old_lo; - _regs[VCPU_REGS_RDX] = old_hi; - _eflags &= ~EFLG_ZF; - } else if (ops->cmpxchg8b_emulated == NULL) { - rc = X86EMUL_UNHANDLEABLE; - goto done; - } else { - if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo, - old_hi, - _regs[VCPU_REGS_RBX], - _regs[VCPU_REGS_RCX], - ctxt)) != 0) - goto done; - _eflags |= EFLG_ZF; - } - break; - } -#elif defined(CONFIG_X86_64) - { - unsigned long old, new; + u64 old, new; if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) goto done; if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || @@ -1359,15 +1331,15 @@ twobyte_special_insn: _regs[VCPU_REGS_RDX] = (u32) (old >> 32); _eflags &= ~EFLG_ZF; } else { - new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; - if ((rc = ops->cmpxchg_emulated(cr2, old, - new, 8, ctxt)) != 0) + new = ((u64)_regs[VCPU_REGS_RCX] << 32) + | (u32) _regs[VCPU_REGS_RBX]; + if ((rc = ops->cmpxchg_emulated(cr2, &old, + &new, 8, ctxt)) != 0) goto done; _eflags |= EFLG_ZF; } break; } -#endif } goto writeback; diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h index 5d41bd55125e..ea3407d7feee 100644 --- a/drivers/kvm/x86_emulate.h +++ b/drivers/kvm/x86_emulate.h @@ -59,8 +59,7 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_std)(unsigned long addr, - unsigned long *val, + int (*read_std)(unsigned long addr, void *val, unsigned int bytes, struct x86_emulate_ctxt * ctxt); /* @@ -71,8 +70,7 @@ struct x86_emulate_ops { * required). * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_std)(unsigned long addr, - unsigned long val, + int (*write_std)(unsigned long addr, const void *val, unsigned int bytes, struct x86_emulate_ctxt * ctxt); /* @@ -82,7 +80,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_emulated) (unsigned long addr, - unsigned long *val, + void *val, unsigned int bytes, struct x86_emulate_ctxt * ctxt); @@ -94,7 +92,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_emulated) (unsigned long addr, - unsigned long val, + const void *val, unsigned int bytes, struct x86_emulate_ctxt * ctxt); @@ -107,29 +105,11 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ int (*cmpxchg_emulated) (unsigned long addr, - unsigned long old, - unsigned long new, + const void *old, + const void *new, unsigned int bytes, struct x86_emulate_ctxt * ctxt); - /* - * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an - * emulated/special memory area. - * @addr: [IN ] Linear address to access. - * @old: [IN ] Value expected to be current at @addr. - * @new: [IN ] Value to write to @addr. - * NOTES: - * 1. This function is only ever called when emulating a real CMPXCHG8B. - * 2. This function is *never* called on x86/64 systems. - * 2. Not defining this function (i.e., specifying NULL) is equivalent - * to defining a function that always returns X86EMUL_UNHANDLEABLE. - */ - int (*cmpxchg8b_emulated) (unsigned long addr, - unsigned long old_lo, - unsigned long old_hi, - unsigned long new_lo, - unsigned long new_hi, - struct x86_emulate_ctxt * ctxt); }; struct cpu_user_regs; From 7807fa6ca5af2e5660a0eb3cd90276ca0c5bdfc8 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Mon, 23 Apr 2007 09:17:21 -0500 Subject: [PATCH 58/66] KVM: Lazy FPU support for SVM Avoid saving and restoring the guest fpu state on every exit. This shaves ~100 cycles off the guest/host switch. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 ++ drivers/kvm/svm.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index d1a90c5d76ce..61ff085df7e6 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -63,6 +63,7 @@ #define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) #define DE_VECTOR 0 +#define NM_VECTOR 7 #define DF_VECTOR 8 #define TS_VECTOR 10 #define NP_VECTOR 11 @@ -301,6 +302,7 @@ struct kvm_vcpu { char fx_buf[FX_BUF_SIZE]; char *host_fx_image; char *guest_fx_image; + int fpu_active; int mmio_needed; int mmio_read_completed; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 644efc5381ad..2a7a0390bfb1 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -587,6 +587,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) init_vmcb(vcpu->svm->vmcb); fx_init(vcpu); + vcpu->fpu_active = 1; vcpu->apic_base = 0xfee00000 | /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | MSR_IA32_APICBASE_ENABLE; @@ -756,6 +757,11 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif + if ((vcpu->cr0 & CR0_TS_MASK) && !(cr0 & CR0_TS_MASK)) { + vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + vcpu->fpu_active = 1; + } + vcpu->cr0 = cr0; cr0 |= CR0_PG_MASK | CR0_WP_MASK; cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); @@ -928,6 +934,16 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static int nm_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if (!(vcpu->cr0 & CR0_TS_MASK)) + vcpu->svm->vmcb->save.cr0 &= ~CR0_TS_MASK; + vcpu->fpu_active = 1; + + return 1; +} + static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { /* @@ -1292,6 +1308,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_INTR] = nop_on_interception, [SVM_EXIT_NMI] = nop_on_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -1481,8 +1498,10 @@ again: load_db_regs(vcpu->svm->db_regs); } - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + } asm volatile ( #ifdef CONFIG_X86_64 @@ -1593,8 +1612,10 @@ again: #endif : "cc", "memory" ); - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + } if ((vcpu->svm->vmcb->save.dr7 & 0xff)) load_db_regs(vcpu->svm->host_db_regs); @@ -1664,6 +1685,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { vcpu->svm->vmcb->save.cr3 = root; force_new_asid(vcpu); + + if (vcpu->fpu_active) { + vcpu->svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + vcpu->svm->vmcb->save.cr0 |= CR0_TS_MASK; + vcpu->fpu_active = 0; + } } static void svm_inject_page_fault(struct kvm_vcpu *vcpu, From e0e5127d06957e76da3906b7a58d5d2665e81f59 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 25 Apr 2007 10:59:52 +0300 Subject: [PATCH 59/66] KVM: Don't complain about cpu erratum AA15 It slows down Windows x64 horribly. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 10845b7ff297..d28c848138ce 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1566,8 +1566,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) skip_emulated_instruction(vcpu); return 1; case 8: - printk(KERN_DEBUG "handle_cr: read CR8 " - "cpu erratum AA15\n"); vcpu_load_rsp_rip(vcpu); vcpu->regs[reg] = vcpu->cr8; vcpu_put_rsp_rip(vcpu); From 25c4c2762e31a75403eca0dd59f2cab85e3a2532 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Fri, 27 Apr 2007 09:29:21 +0300 Subject: [PATCH 60/66] KVM: VMX: Properly shadow the CR0 register in the vcpu struct Set all of the host mask bits for CR0 so that we can maintain a proper shadow of CR0. This exposes CR0.TS, paving the way for lazy fpu handling. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/kvm_main.c | 8 +++----- drivers/kvm/svm.c | 4 ++-- drivers/kvm/vmx.c | 14 ++++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 61ff085df7e6..f99e89e185b2 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -397,7 +397,7 @@ struct kvm_arch_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); + void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 67554034d001..cdf0b176851d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -510,7 +510,6 @@ EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); @@ -1117,7 +1116,6 @@ int emulate_clts(struct kvm_vcpu *vcpu) { unsigned long cr0; - kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); cr0 = vcpu->cr0 & ~CR0_TS_MASK; kvm_arch_ops->set_cr0(vcpu, cr0); return X86EMUL_CONTINUE; @@ -1318,7 +1316,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { - kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); + kvm_arch_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: return vcpu->cr0; @@ -1934,7 +1932,7 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); + kvm_arch_ops->decache_cr4_guest_bits(vcpu); sregs->cr0 = vcpu->cr0; sregs->cr2 = vcpu->cr2; sregs->cr3 = vcpu->cr3; @@ -1985,7 +1983,7 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, #endif vcpu->apic_base = sregs->apic_base; - kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); + kvm_arch_ops->decache_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; kvm_arch_ops->set_cr0(vcpu, sregs->cr0); diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 2a7a0390bfb1..bddd0238869d 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -738,7 +738,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vcpu->svm->vmcb->save.gdtr.base = dt->base ; } -static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) +static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -1759,7 +1759,7 @@ static struct kvm_arch_ops svm_arch_ops = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cs_db_l_bits = svm_get_cs_db_l_bits, - .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, + .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index d28c848138ce..09608114e29a 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -810,11 +810,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->cr0 &= KVM_GUEST_CR0_MASK; - vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK; - vcpu->cr4 &= KVM_GUEST_CR4_MASK; vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } @@ -1205,7 +1202,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_writel(TPR_THRESHOLD, 0); #endif - vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); vcpu->cr0 = 0x60000010; @@ -1557,6 +1554,11 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; }; break; + case 2: /* clts */ + vcpu_load_rsp_rip(vcpu); + set_cr0(vcpu, vcpu->cr0 & ~CR0_TS_MASK); + skip_emulated_instruction(vcpu); + return 1; case 1: /*mov from cr*/ switch (cr) { case 3: @@ -2112,7 +2114,7 @@ static struct kvm_arch_ops vmx_arch_ops = { .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, From 2ab455ccceb07945368709ba852e49f4c3119331 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Fri, 27 Apr 2007 09:29:49 +0300 Subject: [PATCH 61/66] KVM: VMX: Add lazy FPU support for VT Only save/restore the FPU host state when the guest is actually using the FPU. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 09608114e29a..5a2a68dec6bf 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -101,6 +101,13 @@ static inline int is_page_fault(u32 intr_info) (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } +static inline int is_no_device(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -216,6 +223,16 @@ static void vmcs_write64(unsigned long field, u64 value) #endif } +static void vmcs_clear_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -833,6 +850,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif + if (!(cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); + } + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); @@ -842,6 +864,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); + + if (!(vcpu->cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + } } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1368,6 +1396,15 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm ("int $2"); return 1; } + + if (is_no_device(intr_info)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + if (!(vcpu->cr0 & CR0_TS_MASK)) + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + return 1; + } + error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) @@ -1556,7 +1593,11 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->cr0 & ~CR0_TS_MASK); + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vcpu->cr0 &= ~CR0_TS_MASK; + vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -1806,8 +1847,14 @@ again: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + } + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1965,8 +2012,11 @@ again: } #endif - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + } + vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); @@ -2078,6 +2128,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; + vcpu->fpu_active = 1; return 0; From 2807696c3791d6dd1dcf20f022eaa2dc7615bc5d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 28 Apr 2007 21:20:48 +0200 Subject: [PATCH 62/66] KVM: fix an if() condition It might have worked in this case since PT_PRESENT_MASK is 1, but let's express this correctly. Signed-off-by: Adrian Bunk Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 32c64f682081..e8e228118de9 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1408,7 +1408,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { u64 ent = pt[i]; - if (!ent & PT_PRESENT_MASK) + if (!(ent & PT_PRESENT_MASK)) continue; va = canonicalize(va); From 94dfbdb3894eda2f673b70e20da2743c4a8d3968 Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Sun, 29 Apr 2007 11:56:06 +0300 Subject: [PATCH 63/66] KVM: SVM: Only save/restore MSRs when needed We only have to save/restore MSR_GS_BASE on every VMEXIT. The rest can be saved/restored when we leave the VCPU. Since we don't emulate the DEBUGCTL MSRs and the guest cannot write to them, we don't have to worry about saving/restoring them at all. This shaves a whopping 40% off raw vmexit costs on AMD. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/kvm_svm.h | 11 +++++------ drivers/kvm/svm.c | 26 +++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index a1a9eba50d47..a869983d683d 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -9,17 +9,15 @@ #include "svm.h" #include "kvm.h" -static const u32 host_save_msrs[] = { +static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, MSR_GS_BASE, + MSR_FS_BASE, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_IA32_DEBUGCTLMSR, /*MSR_IA32_LASTBRANCHFROMIP, - MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/ }; -#define NR_HOST_SAVE_MSRS ARRAY_SIZE(host_save_msrs) +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) #define NUM_DB_REGS 4 struct vcpu_svm { @@ -32,7 +30,8 @@ struct vcpu_svm { u64 next_rip; - u64 host_msrs[NR_HOST_SAVE_MSRS]; + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + u64 host_gs_base; unsigned long host_cr2; unsigned long host_db_regs[NUM_DB_REGS]; unsigned long host_dr6; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index bddd0238869d..9c15f32eea18 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -522,8 +522,6 @@ static void init_vmcb(struct vmcb *vmcb) control->msrpm_base_pa = msrpm_base; control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; - if (svm_has(SVM_FEATURE_LBRV)) - control->lbr_ctl = 1ULL; init_seg(&save->es); init_seg(&save->ss); @@ -611,7 +609,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu) { - int cpu; + int cpu, i; cpu = get_cpu(); if (unlikely(cpu != vcpu->cpu)) { @@ -626,10 +624,18 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu) vcpu->svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; } + + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + rdmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { + int i; + + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); + rdtscll(vcpu->host_tsc); put_cpu(); } @@ -815,18 +821,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static void load_host_msrs(struct kvm_vcpu *vcpu) { - int i; - - for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) - wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); +#endif } static void save_host_msrs(struct kvm_vcpu *vcpu) { - int i; - - for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) - rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); +#ifdef CONFIG_X86_64 + rdmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); +#endif } static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) From e7df56e4a00358b6975fae3b70dc9df1282d427a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 14 Mar 2007 15:54:54 +0200 Subject: [PATCH 64/66] KVM: Remove extraneous guest entry on mmio read When emulating an mmio read, we actually emulate twice: once to determine the physical address of the mmio, and, after we've exited to userspace to get the mmio value, we emulate again to place the value in the result register and update any flags. But we don't really need to enter the guest again for that, only to take an immediate vmexit. So, if we detect that we're doing an mmio read, emulate a single instruction before entering the guest again. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index f99e89e185b2..41634fde8e13 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -310,6 +310,7 @@ struct kvm_vcpu { int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; + gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index cdf0b176851d..f267dbb52845 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1186,6 +1186,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int r; int cs_db, cs_l; + vcpu->mmio_fault_cr2 = cr2; kvm_arch_ops->cache_regs(vcpu); kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -1804,14 +1805,23 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = complete_pio(vcpu); if (r) goto out; - } else { + } else if (!vcpu->mmio_is_write) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + r = emulate_instruction(vcpu, kvm_run, + vcpu->mmio_fault_cr2, 0); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. + */ + kvm_run->exit_reason = KVM_EXIT_MMIO; + r = 0; + goto out; + } } } - vcpu->mmio_needed = 0; - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { kvm_arch_ops->cache_regs(vcpu); vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; From 02c83209726270ddf9597deabc45e08f6fc3942c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 29 Apr 2007 15:02:17 +0300 Subject: [PATCH 65/66] KVM: Don't require explicit indication of completion of mmio or pio It is illegal not to return from a pio or mmio request without completing it, as mmio or pio is an atomic operation. Therefore, we can simplify the userspace interface by avoiding the completion indication. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 44 +++++++++++++++++++++--------------------- include/linux/kvm.h | 5 ++--- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index f267dbb52845..c8b8cfa332bb 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1237,8 +1237,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_arch_ops->decache_regs(vcpu); kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); - if (vcpu->mmio_is_write) + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; + } return EMULATE_DONE; } @@ -1692,8 +1694,6 @@ static int complete_pio(struct kvm_vcpu *vcpu) vcpu->regs[VCPU_REGS_RSI] += delta; } - vcpu->run->io_completed = 0; - kvm_arch_ops->decache_regs(vcpu); io->count -= io->cur_count; @@ -1800,25 +1800,25 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; - if (kvm_run->io_completed) { - if (vcpu->pio.cur_count) { - r = complete_pio(vcpu); - if (r) - goto out; - } else if (!vcpu->mmio_is_write) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - r = emulate_instruction(vcpu, kvm_run, - vcpu->mmio_fault_cr2, 0); - if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ - kvm_run->exit_reason = KVM_EXIT_MMIO; - r = 0; - goto out; - } + if (vcpu->pio.cur_count) { + r = complete_pio(vcpu); + if (r) + goto out; + } + + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + r = emulate_instruction(vcpu, kvm_run, + vcpu->mmio_fault_cr2, 0); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. + */ + kvm_run->exit_reason = KVM_EXIT_MMIO; + r = 0; + goto out; } } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 07bf353eeb6f..738c2f50c774 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 10 +#define KVM_API_VERSION 11 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -58,9 +58,8 @@ enum kvm_exit_reason { /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ - __u32 io_completed; /* mmio/pio request completed */ __u8 request_interrupt_window; - __u8 padding1[3]; + __u8 padding1[7]; /* out */ __u32 exit_reason; From 2ff81f70b56dc1cdd3bf2f08414608069db6ef1a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 29 Apr 2007 16:25:49 +0300 Subject: [PATCH 66/66] KVM: Remove unused 'instruction_length' As we no longer emulate in userspace, this is meaningless. We don't compute it on SVM anyway. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 1 - include/linux/kvm.h | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 5a2a68dec6bf..724db0027f00 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1783,7 +1783,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); - kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 738c2f50c774..e6edca81ab84 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #include #include -#define KVM_API_VERSION 11 +#define KVM_API_VERSION 12 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -63,10 +63,9 @@ struct kvm_run { /* out */ __u32 exit_reason; - __u32 instruction_length; __u8 ready_for_interrupt_injection; __u8 if_flag; - __u8 padding2[6]; + __u8 padding2[2]; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8;