KVM: PPC: Book3S HV: Close race with testing for signals on guest entry

At present, interrupts are hard-disabled fairly late in the guest
entry path, in the assembly code.  Since we check for pending signals
for the vCPU(s) task(s) earlier in the guest entry path, it is
possible for a signal to be delivered before we enter the guest but
not be noticed until after we exit the guest for some other reason.

Similarly, it is possible for the scheduler to request a reschedule
while we are in the guest entry path, and we won't notice until after
we have run the guest, potentially for a whole timeslice.

Furthermore, with a radix guest on POWER9, we can take the interrupt
with the MMU on.  In this case we end up leaving interrupts
hard-disabled after the guest exit, and they are likely to stay
hard-disabled until we exit to userspace or context-switch to
another process.  This was masking the fact that we were also not
setting the RI (recoverable interrupt) bit in the MSR, meaning
that if we had taken an interrupt, it would have crashed the host
kernel with an unrecoverable interrupt message.

To close these races, we need to check for signals and reschedule
requests after hard-disabling interrupts, and then keep interrupts
hard-disabled until we enter the guest.  If there is a signal or a
reschedule request from another CPU, it will send an IPI, which will
cause a guest exit.

This puts the interrupt disabling before we call kvmppc_start_thread()
for all the secondary threads of this core that are going to run vCPUs.
The reason for that is that once we have started the secondary threads
there is no easy way to back out without going through at least part
of the guest entry path.  However, kvmppc_start_thread() includes some
code for radix guests which needs to call smp_call_function(), which
must be called with interrupts enabled.  To solve this problem, this
patch moves that code into a separate function that is called earlier.

When the guest exit is caused by an external interrupt, a hypervisor
doorbell or a hypervisor maintenance interrupt, we now handle these
using the replay facility.  __kvmppc_vcore_entry() now returns the
trap number that caused the exit on this thread, and instead of the
assembly code jumping to the handler entry, we return to C code with
interrupts still hard-disabled and set the irq_happened flag in the
PACA, so that when we do local_irq_enable() the appropriate handler
gets called.

With all this, we now have the interrupt soft-enable flag clear while
we are in the guest.  This is useful because code in the real-mode
hypercall handlers that checks whether interrupts are enabled will
now see that they are disabled, which is correct, since interrupts
are hard-disabled in the real-mode code.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
This commit is contained in:
Paul Mackerras 2017-06-26 15:45:51 +10:00
parent 898b25b202
commit 8b24e69fc4
3 changed files with 127 additions and 79 deletions

View File

@ -647,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
unsigned long stolen;
unsigned long core_stolen;
u64 now;
unsigned long flags;
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
@ -654,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
core_stolen = vcore_stolen_time(vc, now);
stolen = core_stolen - vcpu->arch.stolen_logged;
vcpu->arch.stolen_logged = core_stolen;
spin_lock_irq(&vcpu->arch.tbacct_lock);
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
stolen += vcpu->arch.busy_stolen;
vcpu->arch.busy_stolen = 0;
spin_unlock_irq(&vcpu->arch.tbacct_lock);
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
@ -2085,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
}
}
extern void __kvmppc_vcore_entry(void);
extern int __kvmppc_vcore_entry(void);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
@ -2167,6 +2168,31 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
}
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
struct kvm *kvm = vcpu->kvm;
/*
* With radix, the guest can do TLB invalidations itself,
* and it could choose to use the local form (tlbiel) if
* it is invalidating a translation that has only ever been
* used on one vcpu. However, that doesn't mean it has
* only ever been used on one physical cpu, since vcpus
* can move around between pcpus. To cope with this, when
* a vcpu moves from one pcpu to another, we need to tell
* any vcpus running on the same core as this vcpu previously
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (vcpu->arch.prev_cpu != pcpu) {
if (vcpu->arch.prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
cpu_first_thread_sibling(pcpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
vcpu->arch.prev_cpu = pcpu;
}
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
int cpu;
@ -2182,26 +2208,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
cpu += vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
/*
* With radix, the guest can do TLB invalidations itself,
* and it could choose to use the local form (tlbiel) if
* it is invalidating a translation that has only ever been
* used on one vcpu. However, that doesn't mean it has
* only ever been used on one physical cpu, since vcpus
* can move around between pcpus. To cope with this, when
* a vcpu moves from one pcpu to another, we need to tell
* any vcpus running on the same core as this vcpu previously
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
if (vcpu->arch.prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
cpu_first_thread_sibling(cpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
vcpu->arch.prev_cpu = cpu;
}
cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = &paca[cpu];
@ -2470,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
spin_unlock(&lp->lock);
}
static bool recheck_signals(struct core_info *cip)
{
int sub, i;
struct kvm_vcpu *vcpu;
for (sub = 0; sub < cip->n_subcores; ++sub)
for_each_runnable_thread(i, vcpu, cip->vc[sub])
if (signal_pending(vcpu->arch.run_task))
return true;
return false;
}
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
int still_running = 0, i;
@ -2568,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu)
return 0;
}
static void set_irq_happened(int trap)
{
switch (trap) {
case BOOK3S_INTERRUPT_EXTERNAL:
local_paca->irq_happened |= PACA_IRQ_EE;
break;
case BOOK3S_INTERRUPT_H_DOORBELL:
local_paca->irq_happened |= PACA_IRQ_DBELL;
break;
case BOOK3S_INTERRUPT_HMI:
local_paca->irq_happened |= PACA_IRQ_HMI;
break;
}
}
/*
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
@ -2587,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int pcpu, thr;
int target_threads;
int controlled_threads;
int trap;
/*
* Remove from the list any threads that have a signal pending
@ -2638,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
if (vc->num_threads < target_threads)
collect_piggybacks(&core_info, target_threads);
/*
* On radix, arrange for TLB flushing if necessary.
* This has to be done before disabling interrupts since
* it uses smp_call_function().
*/
pcpu = smp_processor_id();
if (kvm_is_radix(vc->kvm)) {
for (sub = 0; sub < core_info.n_subcores; ++sub)
for_each_runnable_thread(i, vcpu, core_info.vc[sub])
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
}
/*
* Hard-disable interrupts, and check resched flag and signals.
* If we need to reschedule or deliver a signal, clean up
* and return without going into the guest(s).
*/
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
recheck_signals(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
for (sub = 1; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub];
/* Put back on to the preempted vcores list */
kvmppc_vcore_preempt(pvc);
spin_unlock(&pvc->lock);
}
for (i = 0; i < controlled_threads; ++i)
kvmppc_release_hwthread(pcpu + i);
return;
}
kvmppc_clear_host_core(pcpu);
/* Decide on micro-threading (split-core) mode */
subcore_size = threads_per_subcore;
cmd_bit = stat_bit = 0;
@ -2665,7 +2736,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
pcpu = smp_processor_id();
for (thr = 0; thr < controlled_threads; ++thr)
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
@ -2685,8 +2755,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
kvmppc_clear_host_core(pcpu);
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
@ -2738,14 +2806,25 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
for (sub = 0; sub < core_info.n_subcores; ++sub)
spin_unlock(&core_info.vc[sub]->lock);
/*
* Interrupts will be enabled once we get into the guest,
* so tell lockdep that we're about to enable interrupts.
*/
trace_hardirqs_on();
guest_enter();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
__kvmppc_vcore_entry();
trap = __kvmppc_vcore_entry();
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
guest_exit();
trace_hardirqs_off();
set_irq_happened(trap);
spin_lock(&vc->lock);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
vc->vcore_state = VCORE_EXITING;
@ -2773,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.do_nap = 0;
}
kvmppc_set_host_core(pcpu);
local_irq_enable();
/* Let secondaries go back to the offline loop */
for (i = 0; i < controlled_threads; ++i) {
kvmppc_release_hwthread(pcpu + i);
@ -2781,13 +2864,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
kvmppc_set_host_core(pcpu);
spin_unlock(&vc->lock);
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
guest_exit();
for (sub = 0; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub];

View File

@ -61,13 +61,6 @@ BEGIN_FTR_SECTION
std r3, HSTATE_DABR(r13)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Hard-disable interrupts */
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
rldicl r10,r10,48,1
rotldi r10,r10,16
mtmsrd r10,1
/* Save host PMU registers */
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
@ -153,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
*
* R1 = host R1
* R2 = host R2
* R3 = trap number on this thread
* R12 = exit handler id
* R13 = PACA
*/

View File

@ -69,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
std r0, PPC_LR_STKOFF(r1)
stdu r1, -112(r1)
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
li r0,MSR_RI
andc r0,r10,r0
@ -165,6 +166,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
/* Return the trap number on this thread as the return value */
mr r3, r12
/*
* If we came back from the guest via a relocation-on interrupt,
* we will be in virtual mode at this point, which makes it a
@ -174,59 +178,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
andi. r0, r0, MSR_IR /* in real mode? */
bne .Lvirt_return
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 15f /* Invoke the H_DOORBELL handler */
cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
beq cr2, 14f /* HMI check */
/* RFI into the highmem handler, or branch to interrupt handler */
/* RFI into the highmem handler */
mfmsr r6
li r0, MSR_RI
andc r6, r6, r0
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
/*
* BOOK3S_INTERRUPT_MACHINE_CHECK is handled at the
* time of guest exit
*/
RFI
/* On POWER7, we have external interrupts set to use HSRR0/1 */
11: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0x500
14: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode
15: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0xe80
/* Virtual-mode return - can't get here for HMI or machine check */
/* Virtual-mode return */
.Lvirt_return:
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 16f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 17f
andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
beq 18f
mtmsrd r7, 1 /* if so then re-enable them */
18: mtlr r8
mtlr r8
blr
16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
mtspr SPRN_HSRR1, r7
b exc_virt_0x4500_hardware_interrupt
17: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b exc_virt_0x4e80_h_doorbell
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@ -1258,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
stw r12,VCPU_TRAP(r9)
/*
* Now that we have saved away SRR0/1 and HSRR0/1,
* interrupts are recoverable in principle, so set MSR_RI.
* This becomes important for relocation-on interrupts from
* the guest, which we can get in radix mode on POWER9.
*/
li r0, MSR_RI
mtmsrd r0, 1
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMINTR
mr r4, r9