Commit 6ffe2c6e authored by Nicholas Piggin's avatar Nicholas Piggin Committed by Michael Ellerman
Browse files

KVM: PPC: Book3S HV P9: Reduce irq_work vs guest decrementer races



irq_work's use of the DEC SPR is racy with guest<->host switch and guest
entry which flips the DEC interrupt to guest, which could lose a host
work interrupt.

This patch closes one race, and attempts to comment another class of
races.

Signed-off-by: default avatarNicholas Piggin <npiggin@gmail.com>
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210528090752.3542186-11-npiggin@gmail.com
parent 413679e7
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -97,6 +97,18 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
extern void secondary_cpu_time_init(void);
extern void __init time_init(void);

#ifdef CONFIG_PPC64
static inline unsigned long test_irq_work_pending(void)
{
	unsigned long x;

	asm volatile("lbz %0,%1(13)"
		: "=r" (x)
		: "i" (offsetof(struct paca_struct, irq_work_pending)));
	return x;
}
#endif

DECLARE_PER_CPU(u64, decrementers_next_tb);

/* Convert timebase ticks to nanoseconds */
+0 −10
Original line number Diff line number Diff line
@@ -508,16 +508,6 @@ EXPORT_SYMBOL(profile_pc);
 * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
 */
#ifdef CONFIG_PPC64
static inline unsigned long test_irq_work_pending(void)
{
	unsigned long x;

	asm volatile("lbz %0,%1(13)"
		: "=r" (x)
		: "i" (offsetof(struct paca_struct, irq_work_pending)));
	return x;
}

static inline void set_irq_work_pending_flag(void)
{
	asm volatile("stb %0,%1(13)" : :
+15 −0
Original line number Diff line number Diff line
@@ -3708,6 +3708,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
	if (!(vcpu->arch.ctrl & 1))
		mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);

	/*
	 * When setting DEC, we must always deal with irq_work_raise via NMI vs
	 * setting DEC. The problem occurs right as we switch into guest mode
	 * if a NMI hits and sets pending work and sets DEC, then that will
	 * apply to the guest and not bring us back to the host.
	 *
	 * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
	 * example) and set HDEC to 1? That wouldn't solve the nested hv
	 * case which needs to abort the hcall or zero the time limit.
	 *
	 * XXX: Another day's problem.
	 */
	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());

	if (kvmhv_on_pseries()) {
@@ -3822,6 +3834,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
	vc->in_guest = 0;

	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
	/* We may have raced with new irq work */
	if (test_irq_work_pending())
		set_dec(1);
	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);

	kvmhv_load_host_pmu();