Blob Blame History Raw
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 19 Oct 2017 14:11:23 +1100
Subject: KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts
Patch-mainline: v4.15-rc1
Git-commit: c01015091a77035de1939ef106bfbcaf9a21395f
References: bsc#1077761

This patch removes the restriction that a radix host can only run
radix guests, allowing us to run HPT (hashed page table) guests as
well.  This is useful because it provides a way to run old guest
kernels that know about POWER8 but not POWER9.

Unfortunately, POWER9 currently has a restriction that all threads
in a given core must either all be in HPT mode, or all in radix mode.
This means that when entering a HPT guest, we have to obtain control
of all 4 threads in the core and get them to switch their LPIDR and
LPCR registers, even if they are not going to run a guest.  On guest
exit we also have to get all threads to switch LPIDR and LPCR back
to host values.

To make this feasible, we require that KVM not be in the "independent
threads" mode, and that the CPU cores be in single-threaded mode from
the host kernel's perspective (only thread 0 online; threads 1, 2 and
3 offline).  That allows us to use the same code as on POWER8 for
obtaining control of the secondary threads.

To manage the LPCR/LPIDR changes required, we extend the kvm_split_info
struct to contain the information needed by the secondary threads.
All threads perform a barrier synchronization (where all threads wait
for every other thread to reach the synchronization point) on guest
entry, both before and after loading LPCR and LPIDR.  On guest exit,
they all once again perform a barrier synchronization both before
and after loading host values into LPCR and LPIDR.

Finally, it is also currently necessary to flush the entire TLB every
time we enter a HPT guest on a radix host.  We do this on thread 0
with a loop of tlbiel instructions.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Acked-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |   13 +++
 arch/powerpc/kernel/asm-offsets.c         |    3 
 arch/powerpc/kvm/book3s_hv.c              |   47 +++++++++++---
 arch/powerpc/kvm/book3s_hv_builtin.c      |   99 +++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |   67 ++++++++++++++++++--
 5 files changed, 212 insertions(+), 17 deletions(-)

--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,6 +82,16 @@
 	u8		do_nap;
 	u8		napped[MAX_SMT_THREADS];
 	struct kvmppc_vcore *vc[MAX_SUBCORES];
+	/* Bits for changing lpcr on P9 */
+	unsigned long	lpcr_req;
+	unsigned long	lpidr_req;
+	unsigned long	host_lpcr;
+	u32		do_set;
+	u32		do_restore;
+	union {
+		u32	allphases;
+		u8	phase[4];
+	} lpcr_sync;
 };
 
 /*
@@ -107,7 +117,8 @@
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
-	u8 ptid;
+	u8 ptid;		/* thread number within subcore when split */
+	u8 tid;			/* thread number within whole core */
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	void __iomem *xics_phys;
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -647,6 +647,7 @@
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_PTID, ptid);
+	HSTATE_FIELD(HSTATE_TID, tid);
 	HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
 	HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
 	HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -672,6 +673,8 @@
 	OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
 	OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
 	OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+	OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+	OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2415,6 +2415,11 @@
 	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
 		return false;
 
+	/* POWER9 currently requires all threads to be in the same MMU mode */
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+		return false;
+
 	if (n_threads < cip->max_subcore_threads)
 		n_threads = cip->max_subcore_threads;
 	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2453,9 +2458,6 @@
 	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
-		else if (kvm_is_radix(vc->kvm) != radix_enabled())
-			/* can't actually run HPT guest on radix host yet... */
-			vcpu->arch.ret = -EINVAL;
 		else if (vcpu->arch.vpa.update_pending ||
 			 vcpu->arch.slb_shadow.update_pending ||
 			 vcpu->arch.dtl.update_pending)
@@ -2647,6 +2649,7 @@
 	int controlled_threads;
 	int trap;
 	bool is_power8;
+	bool hpt_on_radix;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -2675,9 +2678,13 @@
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
+	 * On POWER9, we need to be not in independent-threads mode if
+	 * this is a HPT guest on a radix host.
 	 */
-	if ((controlled_threads > 1) &&
-	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+	hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+	if (((controlled_threads > 1) &&
+	     ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+	    (hpt_on_radix && vc->kvm->arch.threads_indep)) {
 		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
@@ -2745,7 +2752,7 @@
 	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
 		&& !cpu_has_feature(CPU_FTR_ARCH_300);
 
-	if (split > 1) {
+	if (split > 1 || hpt_on_radix) {
 		sip = &split_info;
 		memset(&split_info, 0, sizeof(split_info));
 		for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -2767,13 +2774,24 @@
 			split_info.subcore_size = subcore_size;
 		} else {
 			split_info.subcore_size = 1;
+			if (hpt_on_radix) {
+				/* Use the split_info for LPCR/LPIDR changes */
+				split_info.lpcr_req = vc->lpcr;
+				split_info.lpidr_req = vc->kvm->arch.lpid;
+				split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+				split_info.do_set = 1;
+			}
 		}
 
 		/* order writes to split_info before kvm_split_mode pointer */
 		smp_wmb();
 	}
-	for (thr = 0; thr < controlled_threads; ++thr)
+
+	for (thr = 0; thr < controlled_threads; ++thr) {
+		paca[pcpu + thr].kvm_hstate.tid = thr;
+		paca[pcpu + thr].kvm_hstate.napping = 0;
 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+	}
 
 	/* Initiate micro-threading (split-core) on POWER8 if required */
 	if (cmd_bit) {
@@ -2826,8 +2844,10 @@
 	 * When doing micro-threading, poke the inactive threads as well.
 	 * This gets them to the nap instruction after kvm_do_nap,
 	 * which reduces the time taken to unsplit later.
+	 * For POWER9 HPT guest on radix host, we need all the secondary
+	 * threads woken up so they can do the LPCR/LPIDR change.
 	 */
-	if (cmd_bit) {
+	if (cmd_bit || hpt_on_radix) {
 		split_info.do_nap = 1;	/* ask secondaries to nap when done */
 		for (thr = 1; thr < threads_per_subcore; ++thr)
 			if (!(active & (1 << thr)))
@@ -2885,8 +2905,17 @@
 			cpu_relax();
 			++loops;
 		}
-		split_info.do_nap = 0;
+	} else if (hpt_on_radix) {
+		/* Wait for all threads to have seen final sync */
+		for (thr = 1; thr < controlled_threads; ++thr) {
+			while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+				HMT_low();
+				barrier();
+			}
+			HMT_medium();
+		}
 	}
+	split_info.do_nap = 0;
 
 	kvmppc_set_host_core(pcpu);
 
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -278,7 +278,8 @@
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	int ptid = local_paca->kvm_hstate.ptid;
 	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-	int me, ee, i;
+	int me, ee, i, t;
+	int cpu0;
 
 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	   bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@
 		if ((ee >> 8) == 0)
 			kvmhv_interrupt_vcore(vc, ee);
 	}
+
+	/*
+	 * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+	 * we have to interrupt inactive CPU threads to get them to
+	 * restore the host LPCR value.
+	 */
+	if (sip->lpcr_req) {
+		if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+			vc = local_paca->kvm_hstate.kvm_vcore;
+			cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+			for (t = 1; t < threads_per_core; ++t) {
+				if (sip->napped[t])
+					kvmhv_rm_send_ipi(cpu0 + t);
+			}
+		}
+	}
 }
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -619,3 +636,83 @@
 	die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
 	panic("Bad KVM trap");
 }
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE		1	/* in real mode */
+#define PHASE_SET_LPCR		2	/* have set LPCR */
+#define PHASE_OUT_OF_GUEST	4	/* have finished executing in guest */
+#define PHASE_RESET_LPCR	8	/* have reset LPCR to host value */
+
+#define ALL(p)		(((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+	int thr = local_paca->kvm_hstate.tid;
+
+	sip->lpcr_sync.phase[thr] |= phase;
+	phase = ALL(phase);
+	while ((sip->lpcr_sync.allphases & phase) != phase) {
+		HMT_low();
+		barrier();
+	}
+	HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+	unsigned long rb, set;
+
+	/* wait for every other thread to get to real mode */
+	wait_for_sync(sip, PHASE_REALMODE);
+
+	/* Set LPCR and LPIDR */
+	mtspr(SPRN_LPCR, sip->lpcr_req);
+	mtspr(SPRN_LPID, sip->lpidr_req);
+	isync();
+
+	/* Invalidate the TLB on thread 0 */
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_set = 0;
+		asm volatile("ptesync" : : : "memory");
+		for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+			rb = TLBIEL_INVAL_SET_LPID +
+				(set << TLBIEL_INVAL_SET_SHIFT);
+			asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+				     "r" (rb), "r" (0));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+
+	/* indicate that we have done so and wait for others */
+	wait_for_sync(sip, PHASE_SET_LPCR);
+	/* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+	smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+	/* we're out of the guest... */
+	wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_LPCR, sip->host_lpcr);
+	isync();
+
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_restore = 0;
+		smp_wmb();	/* order store of do_restore vs. phase */
+	}
+
+	wait_for_sync(sip, PHASE_RESET_LPCR);
+	smp_mb();
+	local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -82,6 +82,19 @@
 	RFI
 
 kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+	/* On P9, do LPCR setting, if necessary */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	46f
+	lwz	r4, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r4, 0
+	beq	46f
+	bl	kvmhv_p9_set_lpcr
+	nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
 
@@ -385,10 +398,10 @@
 	ld	r6, 0(r6)
 	mtspr	SPRN_HDEC, r6
 	/* and set per-LPAR registers, if doing dynamic micro-threading */
-BEGIN_FTR_SECTION
 	ld	r6, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r6, 0
 	beq	63f
+BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_RPR(r6)
 	mtspr	SPRN_RPR, r0
 	ld	r0, KVM_SPLIT_PMMAR(r6)
@@ -396,7 +409,15 @@
 	ld	r0, KVM_SPLIT_LDBAR(r6)
 	mtspr	SPRN_LDBAR, r0
 	isync
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+FTR_SECTION_ELSE
+	/* On P9 we use the split_info for coordinating LPCR changes */
+	lwz	r4, KVM_SPLIT_DO_SET(r6)
+	cmpwi	r4, 0
+	beq	63f
+	mr	r3, r6
+	bl	kvmhv_p9_set_lpcr
+	nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */
 	lwsync
@@ -467,6 +488,12 @@
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r3, 0
 	beq	kvm_no_guest
+	lwz	r0, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_set
+	lwz	r0, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_restore
 	lbz	r0, KVM_SPLIT_DO_NAP(r3)
 	cmpwi	r0, 0
 	beq	kvm_no_guest
@@ -479,6 +506,19 @@
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
 	b	kvm_no_guest
 
+kvmhv_do_set:
+	/* Set LPCR, LPIDR etc. on P9 */
+	HMT_MEDIUM
+	bl	kvmhv_p9_set_lpcr
+	nop
+	b	kvm_no_guest
+
+kvmhv_do_restore:
+	HMT_MEDIUM
+	bl	kvmhv_p9_restore_lpcr
+	nop
+	b	kvm_no_guest
+
 /*
  * Here the primary thread is trying to return the core to
  * whole-core mode, so we need to nap.
@@ -516,8 +556,7 @@
 	/* Set kvm_split_mode.napped[tid] = 1 */
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	li	r0, 1
-	lhz	r4, PACAPACAINDEX(r13)
-	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
+	lbz	r4, HSTATE_TID(r13)
 	addi	r4, r4, KVM_SPLIT_NAPPED
 	stbx	r0, r3, r4
 	/* Check the do_nap flag again after setting napped[] */
@@ -1913,10 +1952,26 @@
 19:	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-16:	ld	r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+	/* On POWER9 with HPT-on-radix we need to wait for all other threads */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	47f
+	lwz	r8, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r8, 0
+	beq	47f
+	stw	r12, STACK_SLOT_TRAP(r1)
+	bl	kvmhv_p9_restore_lpcr
+	nop
+	lwz	r12, STACK_SLOT_TRAP(r1)
+	b	48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
-
+48:
 	/* load host SLB entries */
 BEGIN_MMU_FTR_SECTION
 	b	0f