Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

The highlights are: * Enable VFIO device on PowerPC, from David Gibson * Optimizations to speed up IPIs between vcpus in HV KVM, from Suresh Warrier (who is also Suresh E. Warrier) * In-kernel handling of IOMMU hypercalls, and support for dynamic DMA windows (DDW), from Alexey Kardashevskiy. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-11-10 22:21:40 +00:00 · 2016-03-03 14:35:44 +01:00 · 2016-03-03 14:35:44 +01:00 · 61ec84f145
commit 61ec84f145
parent 433da86023 58ded4201f
23 changed files with 1003 additions and 86 deletions
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@ -3035,6 +3035,63 @@ Returns: 0 on success, -1 on error

 Queues an SMI on the thread's vcpu.

+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+4.98 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows, described in 4.62 KVM_CREATE_SPAPR_TCE
+
+This capability uses extended struct in ioctl interface:
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size; 	/* in pages */
+};
+
+The aim of extension is to support an additional bigger DMA window with
+a variable page size.
+KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and
+a bus offset of the corresponding DMA window, @size and @offset are numbers
+of IOMMU pages.
+
+@flags are not used at the moment.
+
+The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
+
 5. The kvm_run structure
 ------------------------

--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 }
 #endif

-#define SPAPR_TCE_SHIFT		12
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@ -182,7 +182,10 @@ struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
 	u64 liobn;
-	u32 window_size;
+	struct rcu_head rcu;
+	u32 page_shift;
+	u64 offset;		/* in pages */
+	u64 size;		/* window size in pages */
 	struct page *pages[0];
 };

--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);

 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				struct kvm_create_spapr_tce *args);
+				struct kvm_create_spapr_tce_64 *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+		struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+		unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+		unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+		unsigned long idx, unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+extern void kvmppc_alloc_host_rm_ops(void);
+extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 			struct kvm_vcpu *vcpu, u32 cpu);
+extern void kvmppc_xics_ipi_action(void);
+extern int h_ipi_redirect;
 #else
+static inline void kvmppc_alloc_host_rm_ops(void) {};
+static inline void kvmppc_free_host_rm_ops(void) {};
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
 #endif

+/*
+ * Host-side operations we want to set up while running in real
+ * mode in the guest operating on the xics.
+ * Currently only VCPU wakeup is supported.
+ */
+
+union kvmppc_rm_state {
+	unsigned long raw;
+	struct {
+		u32 in_host;
+		u32 rm_action;
+	};
+};
+
+struct kvmppc_host_rm_core {
+	union kvmppc_rm_state rm_state;
+	void *rm_data;
+	char pad[112];
+};
+
+struct kvmppc_host_rm_ops {
+	struct kvmppc_host_rm_core	*rm_core;
+	void		(*vcpu_kick)(struct kvm_vcpu *vcpu);
+};
+
+extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+
 static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	}
 	return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
 }
+
+unsigned long vmalloc_to_phys(void *vmalloc_addr);
+
 #endif /* __ASSEMBLY__ */

 #endif /* _ASM_POWERPC_PGTABLE_H */
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu);
 #define PPC_MSG_TICK_BROADCAST	2
 #define PPC_MSG_DEBUGGER_BREAK  3

+/* This is only used by the powernv kernel */
+#define PPC_MSG_RM_HOST_ACTION	4
+
 /* for irq controllers that have dedicated ipis per message (4) */
 extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
@ -121,6 +124,7 @@ extern const char *smp_ipi_name[];
 /* for irq controllers with only a single ipi */
 extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);

 void smp_init_pSeries(void);
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@ -30,6 +30,7 @@
 #ifdef CONFIG_PPC_ICP_NATIVE
 extern int icp_native_init(void);
 extern void icp_native_flush_interrupt(void);
+extern void icp_native_cause_ipi_rm(int cpu);
 #else
 static inline int icp_native_init(void) { return -ENODEV; }
 #endif
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };

+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size;	/* in pages */
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
 	__u64 rma_size;
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg)

 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 struct cpu_messages {
-	int messages;			/* current messages */
+	long messages;			/* current messages */
 	unsigned long data;		/* data for cause ipi */
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data)
 	info->data = data;
 }

-void smp_muxed_ipi_message_pass(int cpu, int msg)
+void smp_muxed_ipi_set_message(int cpu, int msg)
 {
 	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
 	char *message = (char *)&info->messages;
@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 	 */
 	smp_mb();
 	message[msg] = 1;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+
+	smp_muxed_ipi_set_message(cpu, msg);
 	/*
 	 * cause_ipi functions are required to include a full barrier
 	 * before doing whatever causes the IPI.
@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 }

 #ifdef __BIG_ENDIAN__
-#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
 #else
-#define IPI_MESSAGE(A) (1 << (8 * (A)))
+#define IPI_MESSAGE(A) (1uL << (8 * (A)))
 #endif

 irqreturn_t smp_ipi_demux(void)
 {
 	struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-	unsigned int all;
+	unsigned long all;

 	mb();	/* order any irq clear */

 	do {
 		all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+		/*
+		 * Must check for PPC_MSG_RM_HOST_ACTION messages
+		 * before PPC_MSG_CALL_FUNCTION messages because when
+		 * a VM is destroyed, we call kick_all_cpus_sync()
+		 * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+		 * messages have completed before we free any VCPUs.
+		 */
+		if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+			kvmppc_xics_ipi_action();
+#endif
 		if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
 			generic_smp_call_function_interrupt();
 		if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm

 common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-		$(KVM)/eventfd.o
+		$(KVM)/eventfd.o $(KVM)/vfio.o

 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {

 #ifdef CONFIG_PPC64
-	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
 	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif

--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@ -14,6 +14,7 @@
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
 */

 #include <linux/types.h>
@ -36,28 +37,69 @@
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>

-#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
-
-static long kvmppc_stt_npages(unsigned long window_size)
+static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
 {
-	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+	return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }

-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 {
-	struct kvm *kvm = stt->kvm;
-	int i;
+	unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+			(tce_pages * sizeof(struct page *));

-	mutex_lock(&kvm->lock);
-	list_del(&stt->list);
-	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
+}
+
+static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
+{
+	long ret = 0;
+
+	if (!current || !current->mm)
+		return ret; /* process exited */
+
+	down_write(&current->mm->mmap_sem);
+
+	if (inc) {
+		unsigned long locked, lock_limit;
+
+		locked = current->mm->locked_vm + stt_pages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			current->mm->locked_vm += stt_pages;
+	} else {
+		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+			stt_pages = current->mm->locked_vm;
+
+		current->mm->locked_vm -= stt_pages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+			inc ? '+' : '-',
+			stt_pages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK),
+			ret ? " - exceeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void release_spapr_tce_table(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_table *stt = container_of(head,
+			struct kvmppc_spapr_tce_table, rcu);
+	unsigned long i, npages = kvmppc_tce_pages(stt->size);
+
+	for (i = 0; i < npages; i++)
 		__free_page(stt->pages[i]);
-	kfree(stt);
-	mutex_unlock(&kvm->lock);

-	kvm_put_kvm(kvm);
+	kfree(stt);
 }

 static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
 	struct page *page;

-	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
 		return VM_FAULT_SIGBUS;

 	page = stt->pages[vmf->pgoff];
@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;

-	release_spapr_tce_table(stt);
+	list_del_rcu(&stt->list);
+
+	kvm_put_kvm(stt->kvm);
+
+	kvmppc_account_memlimit(
+		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+	call_rcu(&stt->rcu, release_spapr_tce_table);
+
 	return 0;
 }

@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = {
 };

 long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				   struct kvm_create_spapr_tce *args)
+				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
-	long npages;
+	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;

+	if (!args->size)
+		return -EINVAL;
+
 	/* Check this LIOBN hasn't been previously allocated */
 	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
 		if (stt->liobn == args->liobn)
 			return -EBUSY;
 	}

-	npages = kvmppc_stt_npages(args->window_size);
+	size = args->size;
+	npages = kvmppc_tce_pages(size);
+	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	if (ret) {
+		stt = NULL;
+		goto fail;
+	}

 	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
 		      GFP_KERNEL);
@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		goto fail;

 	stt->liobn = args->liobn;
-	stt->window_size = args->window_size;
+	stt->page_shift = args->page_shift;
+	stt->offset = args->offset;
+	stt->size = size;
 	stt->kvm = kvm;

 	for (i = 0; i < npages; i++) {
@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	kvm_get_kvm(kvm);

 	mutex_lock(&kvm->lock);
-	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+	list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);

 	mutex_unlock(&kvm->lock);

@ -148,3 +208,59 @@ fail:
 	}
 	return ret;
 }
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret = H_SUCCESS, idx;
+	unsigned long entry, ua = 0;
+	u64 __user *tces, tce;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	entry = ioba >> stt->page_shift;
+	/*
+	 * SPAPR spec says that the maximum size of the list is 512 TCEs
+	 * so the whole table fits in 4K page
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & (SZ_4K - 1))
+		return H_PARAMETER;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+		ret = H_TOO_HARD;
+		goto unlock_exit;
+	}
+	tces = (u64 __user *) ua;
+
+	for (i = 0; i < npages; ++i) {
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
+
+		ret = kvmppc_tce_validate(stt, tce);
+		if (ret != H_SUCCESS)
+			goto unlock_exit;
+
+		kvmppc_tce_put(stt, entry + i, tce);
+	}
+
+unlock_exit:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@ -14,6 +14,7 @@
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
 */

 #include <linux/types.h>
@ -30,76 +31,321 @@
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu-hash64.h>
+#include <asm/mmu_context.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/iommu.h>

 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))

-/* WARNING: This will be called in real-mode on HV KVM and virtual
+/*
+ * Finds a TCE table descriptor by LIOBN.
+ *
+ * WARNING: This will be called in real or virtual mode on HV KVM and virtual
 *          mode on PR KVM
 */
-long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-		      unsigned long ioba, unsigned long tce)
+struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+		unsigned long liobn)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_spapr_tce_table *stt;

+	list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+		if (stt->liobn == liobn)
+			return stt;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_find_table);
+
+/*
+ * Validates IO address.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long ioba, unsigned long npages)
+{
+	unsigned long mask = (1ULL << stt->page_shift) - 1;
+	unsigned long idx = ioba >> stt->page_shift;
+
+	if ((ioba & mask) || (idx < stt->offset) ||
+			(idx - stt->offset + npages > stt->size) ||
+			(idx + npages < idx))
+		return H_PARAMETER;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+
+/*
+ * Validates TCE address.
+ * At the moment flags and page mask are validated.
+ * As the host kernel does not access those addresses (just puts them
+ * to the table and user space is supposed to process them), we can skip
+ * checking other things (such as TCE is a guest RAM address or the page
+ * was actually allocated).
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+	unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+	unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+
+	if (tce & mask)
+		return H_PARAMETER;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+
+/* Note on the use of page_address() in real mode,
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address()
+ * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic
+ * operation and does not access page struct.
+ *
+ * Theoretically page_address() could be defined different
+ * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * would have to be enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+	return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Called in both real and virtual modes.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+		unsigned long idx, unsigned long tce)
+{
+	struct page *page;
+	u64 *tbl;
+
+	idx -= stt->offset;
+	page = stt->pages[idx / TCES_PER_PAGE];
+	tbl = kvmppc_page_address(page);
+
+	tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+
+long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+		unsigned long *ua, unsigned long **prmap)
+{
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	memslot = search_memslots(kvm_memslots(kvm), gfn);
+	if (!memslot)
+		return -EINVAL;
+
+	*ua = __gfn_to_hva_memslot(memslot, gfn) |
+		(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (prmap)
+		*prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	long ret;
+
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */

-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == liobn) {
-			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-			struct page *page;
-			u64 *tbl;
+	if (!stt)
+		return H_TOO_HARD;

-			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-			/* 	    liobn, stt, stt->window_size); */
-			if (ioba >= stt->window_size)
-				return H_PARAMETER;
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;

-			page = stt->pages[idx / TCES_PER_PAGE];
-			tbl = (u64 *)page_address(page);
+	ret = kvmppc_tce_validate(stt, tce);
+	if (ret != H_SUCCESS)
+		return ret;

-			/* FIXME: Need to validate the TCE itself */
-			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-			tbl[idx % TCES_PER_PAGE] = tce;
-			return H_SUCCESS;
-		}
-	}
+	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);

-	/* Didn't find the liobn, punt it to userspace */
-	return H_TOO_HARD;
+	return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);

+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+		unsigned long ua, unsigned long *phpa)
+{
+	pte_t *ptep, pte;
+	unsigned shift = 0;
+
+	ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+	if (!ptep || !pte_present(*ptep))
+		return -ENXIO;
+	pte = *ptep;
+
+	if (!shift)
+		shift = PAGE_SHIFT;
+
+	/* Avoid handling anything potentially complicated in realmode */
+	if (shift > PAGE_SHIFT)
+		return -EAGAIN;
+
+	if (!pte_young(pte))
+		return -EAGAIN;
+
+	*phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+			(ua & ~PAGE_MASK);
+
+	return 0;
+}
+
+long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list,	unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret = H_SUCCESS;
+	unsigned long tces, entry, ua = 0;
+	unsigned long *rmap = NULL;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	entry = ioba >> stt->page_shift;
+	/*
+	 * The spec says that the maximum size of the list is 512 TCEs
+	 * so the whole table addressed resides in 4K page
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & (SZ_4K - 1))
+		return H_PARAMETER;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+		return H_TOO_HARD;
+
+	rmap = (void *) vmalloc_to_phys(rmap);
+
+	/*
+	 * Synchronize with the MMU notifier callbacks in
+	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+	 * While we have the rmap lock, code running on other CPUs
+	 * cannot finish unmapping the host real page that backs
+	 * this guest real page, so we are OK to access the host
+	 * real page.
+	 */
+	lock_rmap(rmap);
+	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+		ret = H_TOO_HARD;
+		goto unlock_exit;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+
+		ret = kvmppc_tce_validate(stt, tce);
+		if (ret != H_SUCCESS)
+			goto unlock_exit;
+
+		kvmppc_tce_put(stt, entry + i, tce);
+	}
+
+unlock_exit:
+	unlock_rmap(rmap);
+
+	return ret;
+}
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	/* Check permission bits only to allow userspace poison TCE for debug */
+	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+
 long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba)
 {
-	struct kvm *kvm = vcpu->kvm;
-	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	long ret;
+	unsigned long idx;
+	struct page *page;
+	u64 *tbl;

-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == liobn) {
-			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-			struct page *page;
-			u64 *tbl;
+	if (!stt)
+		return H_TOO_HARD;

-			if (ioba >= stt->window_size)
-				return H_PARAMETER;
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;

-			page = stt->pages[idx / TCES_PER_PAGE];
-			tbl = (u64 *)page_address(page);
+	idx = (ioba >> stt->page_shift) - stt->offset;
+	page = stt->pages[idx / TCES_PER_PAGE];
+	tbl = (u64 *)page_address(page);

-			vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
-			return H_SUCCESS;
-		}
-	}
+	vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];

-	/* Didn't find the liobn, punt it to userspace */
-	return H_TOO_HARD;
+	return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+
+#endif /* KVM_BOOK3S_HV_POSSIBLE */
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@ -81,6 +81,17 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");

+#ifdef CONFIG_KVM_XICS
+static struct kernel_param_ops module_param_ops = {
+	.set = param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+							S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+#endif
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		if (kvmppc_xics_enabled(vcpu)) {
 			ret = kvmppc_xics_hcall(vcpu, req);
 			break;
-		} /* fallthrough */
+		}
+		return RESUME_HOST;
+	case H_PUT_TCE:
+		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PUT_TCE_INDIRECT:
+		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_STUFF_TCE:
+		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	default:
 		return RESUME_HOST;
 	}
@ -2278,6 +2313,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 	spin_unlock(&vc->lock);
 }

+/*
+ * Clear core from the list of active host cores as we are about to
+ * enter the guest. Only do this if it is the primary thread of the
+ * core (not if a subcore) that is entering the guest.
+ */
+static inline void kvmppc_clear_host_core(int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return;
+	/*
+	 * Memory barrier can be omitted here as we will do a smp_wmb()
+	 * later in kvmppc_start_thread and we need ensure that state is
+	 * visible to other CPUs only after we enter guest.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+}
+
+/*
+ * Advertise this core as an active host core since we exited the guest
+ * Only need to do this if it is the primary thread of the core that is
+ * exiting.
+ */
+static inline void kvmppc_set_host_core(int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return;
+
+	/*
+	 * Memory barrier can be omitted here because we do a spin_unlock
+	 * immediately after this which provides the memory barrier.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+}
+
 /*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
@ -2390,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		}
 	}

+	kvmppc_clear_host_core(pcpu);
+
 	/* Start all the threads */
 	active = 0;
 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
@ -2486,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			kvmppc_ipi_thread(pcpu + i);
 	}

+	kvmppc_set_host_core(pcpu);
+
 	spin_unlock(&vc->lock);

 	/* make sure updates to secondary vcpu structs are visible now */
@ -2984,6 +3063,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	goto out_srcu;
 }

+#ifdef CONFIG_KVM_XICS
+static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+			void *hcpu)
+{
+	unsigned long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		kvmppc_set_host_core(cpu);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		kvmppc_clear_host_core(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmppc_cpu_notifier = {
+	    .notifier_call = kvmppc_cpu_notify,
+};
+
+/*
+ * Allocate a per-core structure for managing state about which cores are
+ * running in the host versus the guest and for exchanging data between
+ * real mode KVM and CPU running in the host.
+ * This is only done for the first VM.
+ * The allocated structure stays even if all VMs have stopped.
+ * It is only freed when the kvm-hv module is unloaded.
+ * It's OK for this routine to fail, we just don't support host
+ * core operations like redirecting H_IPI wakeups.
+ */
+void kvmppc_alloc_host_rm_ops(void)
+{
+	struct kvmppc_host_rm_ops *ops;
+	unsigned long l_ops;
+	int cpu, core;
+	int size;
+
+	/* Not the first time here ? */
+	if (kvmppc_host_rm_ops_hv != NULL)
+		return;
+
+	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+	if (!ops)
+		return;
+
+	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+	ops->rm_core = kzalloc(size, GFP_KERNEL);
+
+	if (!ops->rm_core) {
+		kfree(ops);
+		return;
+	}
+
+	get_online_cpus();
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+		if (!cpu_online(cpu))
+			continue;
+
+		core = cpu >> threads_shift;
+		ops->rm_core[core].rm_state.in_host = 1;
+	}
+
+	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+
+	/*
+	 * Make the contents of the kvmppc_host_rm_ops structure visible
+	 * to other CPUs before we assign it to the global variable.
+	 * Do an atomic assignment (no locks used here), but if someone
+	 * beats us to it, just free our copy and return.
+	 */
+	smp_wmb();
+	l_ops = (unsigned long) ops;
+
+	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+		put_online_cpus();
+		kfree(ops->rm_core);
+		kfree(ops);
+		return;
+	}
+
+	register_cpu_notifier(&kvmppc_cpu_notifier);
+
+	put_online_cpus();
+}
+
+void kvmppc_free_host_rm_ops(void)
+{
+	if (kvmppc_host_rm_ops_hv) {
+		unregister_cpu_notifier(&kvmppc_cpu_notifier);
+		kfree(kvmppc_host_rm_ops_hv->rm_core);
+		kfree(kvmppc_host_rm_ops_hv);
+		kvmppc_host_rm_ops_hv = NULL;
+	}
+}
+#endif
+
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
@ -2996,6 +3183,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;

+	kvmppc_alloc_host_rm_ops();
+
 	/*
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
@ -3229,6 +3418,7 @@ static int kvmppc_book3s_init_hv(void)

 static void kvmppc_book3s_exit_hv(void)
 {
+	kvmppc_free_host_rm_ops();
 	kvmppc_hv_ops = NULL;
 }

--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap)
 			kvmhv_interrupt_vcore(vc, ee);
 	}
 }
+
+struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@ -17,12 +17,16 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/synch.h>
+#include <asm/cputhreads.h>
 #include <asm/ppc-opcode.h>

 #include "book3s_xics.h"

 #define DEBUG_PASSUP

+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);

@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,

 /* -- ICP routines -- */

+#ifdef CONFIG_SMP
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
+{
+	int hcpu;
+
+	hcpu = hcore << threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
+	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
+	icp_native_cause_ipi_rm(hcpu);
+}
+#else
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
+#endif
+
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go in a circle until we get back to our ID looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops.
+ *
+ * In the future, could consider using a fairer algorithm (one
+ * that distributes the IPIs better)
+ *
+ * Returns -1, if no CPU could be found in the host
+ * Else, returns a CPU Id which has been reserved for use
+ */
+static inline int grab_next_hostcore(int start,
+		struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+	bool success;
+	int core;
+	union kvmppc_rm_state old, new;
+
+	for (core = start + 1; core < max; core++)  {
+		old = new = READ_ONCE(rm_core[core].rm_state);
+
+		if (!old.in_host || old.rm_action)
+			continue;
+
+		/* Try to grab this host core if not taken already. */
+		new.rm_action = action;
+
+		success = cmpxchg64(&rm_core[core].rm_state.raw,
+						old.raw, new.raw) == old.raw;
+		if (success) {
+			/*
+			 * Make sure that the store to the rm_action is made
+			 * visible before we return to caller (and the
+			 * subsequent store to rm_data) to synchronize with
+			 * the IPI handler.
+			 */
+			smp_wmb();
+			return core;
+		}
+	}
+
+	return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+	int core;
+	int my_core = smp_processor_id() >> threads_shift;
+	struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+	core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+	if (core == -1)
+		core = grab_next_hostcore(core, rm_core, my_core, action);
+
+	return core;
+}
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *this_vcpu)
 {
 	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
 	int cpu;
+	int hcore;

 	/* Mark the target VCPU as having an interrupt pending */
 	vcpu->stat.queue_intr++;
@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 		return;
 	}

-	/* Check if the core is loaded, if not, too hard */
+	/*
+	 * Check if the core is loaded,
+	 * if not, find an available host core to post to wake the VCPU,
+	 * if we can't find one, set up state to eventually return too hard.
+	 */
 	cpu = vcpu->arch.thread_cpu;
 	if (cpu < 0 || cpu >= nr_cpu_ids) {
-		this_icp->rm_action |= XICS_RM_KICK_VCPU;
-		this_icp->rm_kick_target = vcpu;
+		hcore = -1;
+		if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
+			hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
+		if (hcore != -1) {
+			icp_send_hcore_msg(hcore, vcpu);
+		} else {
+			this_icp->rm_action |= XICS_RM_KICK_VCPU;
+			this_icp->rm_kick_target = vcpu;
+		}
 		return;
 	}

@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 bail:
 	return check_too_hard(xics, icp);
 }
+
+/*  --- Non-real mode XICS-related built-in routines ---  */
+
+/**
+ * Host Operations poked by RM KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+	switch (action) {
+	case XICS_RM_KICK_VCPU:
+		kvmppc_host_rm_ops_hv->vcpu_kick(data);
+		break;
+	default:
+		WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+		break;
+	}
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+	int core;
+	unsigned int cpu = smp_processor_id();
+	struct kvmppc_host_rm_core *rm_corep;
+
+	core = cpu >> threads_shift;
+	rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+	if (rm_corep->rm_data) {
+		rm_host_ipi_action(rm_corep->rm_state.rm_action,
+							rm_corep->rm_data);
+		/* Order these stores against the real mode KVM */
+		rm_corep->rm_data = NULL;
+		smp_wmb();
+		rm_corep->rm_state.rm_action = 0;
+	}
+}
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@ -2006,8 +2006,8 @@ hcall_real_table:
 	.long	0		/* 0x12c */
 	.long	0		/* 0x130 */
 	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-	.long	0		/* 0x138 */
-	.long	0		/* 0x13c */
+	.long	DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
 	.long	0		/* 0x140 */
 	.long	0		/* 0x144 */
 	.long	0		/* 0x148 */
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }

+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba,
+			tce, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
 	long rc = kvmppc_xics_hcall(vcpu, cmd);
@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		return kvmppc_h_pr_bulk_remove(vcpu);
 	case H_PUT_TCE:
 		return kvmppc_h_pr_put_tce(vcpu);
+	case H_PUT_TCE_INDIRECT:
+		return kvmppc_h_pr_put_tce_indirect(vcpu);
+	case H_STUFF_TCE:
+		return kvmppc_h_pr_stuff_tce(vcpu);
 	case H_CEDE:
 		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
 		kvm_vcpu_block(vcpu);
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@ -33,6 +33,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
+#include <asm/iommu.h>
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	unsigned int i;
 	struct kvm_vcpu *vcpu;

+#ifdef CONFIG_KVM_XICS
+	/*
+	 * We call kick_all_cpus_sync() to ensure that all
+	 * CPUs have executed any pending IPIs before we
+	 * continue and free VCPUs structures below.
+	 */
+	if (is_kvmppc_hv_enabled(kvm))
+		kick_all_cpus_sync();
+#endif
+
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_arch_vcpu_free(vcpu);

@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)

 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_SPAPR_TCE_64:
 	case KVM_CAP_PPC_ALLOC_HTAB:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_GET_SMMU_INFO:
 		r = 1;
 		break;
+	case KVM_CAP_SPAPR_MULTITCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 #ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CREATE_SPAPR_TCE_64: {
+		struct kvm_create_spapr_tce_64 create_tce_64;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+			goto out;
+		if (create_tce_64.flags) {
+			r = -EINVAL;
+			goto out;
+		}
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+		goto out;
+	}
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
+		struct kvm_create_spapr_tce_64 create_tce_64;

 		r = -EFAULT;
 		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
 			goto out;
-		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+
+		create_tce_64.liobn = create_tce.liobn;
+		create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.offset = 0;
+		create_tce_64.size = create_tce.window_size >>
+				IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.flags = 0;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
 		goto out;
 	}
 	case KVM_PPC_GET_SMMU_INFO: {
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 }
 #endif /* CONFIG_DEBUG_VM */

+unsigned long vmalloc_to_phys(void *va)
+{
+	unsigned long pfn = vmalloc_to_pfn(va);
+
+	BUG_ON(!pfn);
+	return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
+}
+EXPORT_SYMBOL_GPL(vmalloc_to_phys);
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
 	}
 }

-static unsigned long vmalloc_to_phys(void *v)
-{
-	struct page *p = vmalloc_to_page(v);
-
-	BUG_ON(!p);
-	return page_to_phys(p) + offset_in_page(v);
-}
-
 /* */
 struct event_uniq {
 	struct rb_node node;
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }

+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void icp_native_cause_ipi_rm(int cpu)
+{
+	/*
+	 * Currently not used to send IPIs to another CPU
+	 * on the same core. Only caller is KVM real mode.
+	 * Need the physical address of the XICS to be
+	 * previously saved in kvm_hstate in the paca.
+	 */
+	unsigned long xics_phys;
+
+	/*
+	 * Just like the cause_ipi functions, it is required to
+	 * include a full barrier (out8 includes a sync) before
+	 * causing the IPI.
+	 */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+}
+#endif
+
 /*
 * Called when an interrupt is received on an off-line CPU to
 * clear the interrupt, so that the CPU can go back to nap mode.
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@ -862,6 +862,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
+#define KVM_CAP_SPAPR_TCE_64 125

 #ifdef KVM_CAP_IRQ_ROUTING

@ -1154,6 +1155,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64	  _IOW(KVMIO,  0xa8, \
+				       struct kvm_create_spapr_tce_64)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */