Commit 5d36acb7 authored by Chris Wilson, committed by Andi Shyti
Browse files

drm/i915/gt: Batch TLB invalidations



Invalidate TLB in batches, in order to reduce performance regressions.

Currently, every caller performs a full barrier around a TLB
invalidation, ignoring all other invalidations that may have already
removed their PTEs from the cache. As this is a synchronous operation
and can be quite slow, we cause multiple threads to contend on the TLB
invalidate mutex blocking userspace.

We only need to invalidate the TLB once after replacing our PTE to
ensure that there is no possible continued access to the physical
address before releasing our pages. By tracking a seqno for each full
TLB invalidate we can quickly determine if one has been performed since
rewriting the PTE, and only if necessary trigger one for ourselves.

That helps to reduce the performance regression introduced by the TLB
invalidation logic.

[mchehab: rebased to not require moving the code to a separate file]

Cc: stable@vger.kernel.org
Fixes: 7938d615 ("drm/i915: Flush TLBs before releasing backing store")
Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Cc: Fei Yang <fei.yang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/4e97ef5deb6739cadaaf40aa45620547e9c4ec06.1658924372.git.mchehab@kernel.org
parent be0366f1
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -335,7 +335,6 @@ struct drm_i915_gem_object {
#define I915_BO_READONLY          BIT(7)
#define I915_TILING_QUIRK_BIT     8 /* unknown swizzling; do not release! */
#define I915_BO_PROTECTED         BIT(9)
#define I915_BO_WAS_BOUND_BIT     10
	/**
	 * @mem_flags - Mutable placement-related flags
	 *
@@ -616,6 +615,8 @@ struct drm_i915_gem_object {
		 * pages were last acquired.
		 */
		bool dirty:1;

		u32 tlb;
	} mm;

	struct {
+13 −8
Original line number Diff line number Diff line
@@ -191,6 +191,18 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr)
		vunmap(ptr);
}

/*
 * Issue the TLB invalidation recorded on @obj, if there is one.
 *
 * obj->mm.tlb holds the seqno that must be passed before the object's
 * pages may be released; zero means nothing is pending. Once the
 * invalidation has been requested the field is reset to zero so the
 * request is not repeated.
 */
static void flush_tlb_invalidate(struct drm_i915_gem_object *obj)
{
	if (obj->mm.tlb) {
		struct intel_gt *gt = to_gt(to_i915(obj->base.dev));

		intel_gt_invalidate_tlb(gt, obj->mm.tlb);
		obj->mm.tlb = 0;
	}
}

struct sg_table *
__i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
{
@@ -216,14 +228,7 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
	__i915_gem_object_reset_page_iter(obj);
	obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;

	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
		struct drm_i915_private *i915 = to_i915(obj->base.dev);
		struct intel_gt *gt = to_gt(i915);
		intel_wakeref_t wakeref;

		with_intel_gt_pm_if_awake(gt, wakeref)
			intel_gt_invalidate_tlbs(gt);
	}
	flush_tlb_invalidate(obj);

	return pages;
}
+40 −13
Original line number Diff line number Diff line
@@ -38,8 +38,6 @@ static void __intel_gt_init_early(struct intel_gt *gt)
{
	spin_lock_init(&gt->irq_lock);

	mutex_init(&gt->tlb_invalidate_lock);

	INIT_LIST_HEAD(&gt->closed_vma);
	spin_lock_init(&gt->closed_lock);

@@ -50,6 +48,8 @@ static void __intel_gt_init_early(struct intel_gt *gt)
	intel_gt_init_reset(gt);
	intel_gt_init_requests(gt);
	intel_gt_init_timelines(gt);
	mutex_init(&gt->tlb.invalidate_lock);
	seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock);
	intel_gt_pm_init_early(gt);

	intel_uc_init_early(&gt->uc);
@@ -770,6 +770,7 @@ void intel_gt_driver_late_release_all(struct drm_i915_private *i915)
		intel_gt_fini_requests(gt);
		intel_gt_fini_reset(gt);
		intel_gt_fini_timelines(gt);
		mutex_destroy(&gt->tlb.invalidate_lock);
		intel_engines_free(gt);
	}
}
@@ -908,7 +909,7 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
	return rb;
}

void intel_gt_invalidate_tlbs(struct intel_gt *gt)
static void mmio_invalidate_full(struct intel_gt *gt)
{
	static const i915_reg_t gen8_regs[] = {
		[RENDER_CLASS]			= GEN8_RTCR,
@@ -931,12 +932,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
	const i915_reg_t *regs;
	unsigned int num = 0;

	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
		return;

	if (intel_gt_is_wedged(gt))
		return;

	if (GRAPHICS_VER(i915) == 12) {
		regs = gen12_regs;
		num = ARRAY_SIZE(gen12_regs);
@@ -951,9 +946,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
			  "Platform does not implement TLB invalidation!"))
		return;

	GEM_TRACE("\n");

	mutex_lock(&gt->tlb_invalidate_lock);
	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);

	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
@@ -973,6 +965,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
		awake |= engine->mask;
	}

	GT_TRACE(gt, "invalidated engines %08x\n", awake);

	/* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */
	if (awake &&
	    (IS_TIGERLAKE(i915) ||
@@ -1012,5 +1006,38 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
	 * transitions.
	 */
	intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
	mutex_unlock(&gt->tlb_invalidate_lock);
}

/*
 * Report whether the GT's TLB seqno has moved beyond @seqno.
 *
 * @seqno is first rounded up to a multiple of two, so that only a
 * *full* TLB invalidate barrier counts as having passed. The
 * difference is evaluated as a signed 32-bit value so the comparison
 * stays meaningful when the u32 counter wraps.
 */
static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno)
{
	const u32 now = intel_gt_tlb_seqno(gt);
	const u32 barrier = ALIGN(seqno, 2);
	const s32 delta = (s32)(now - barrier);

	return delta > 0;
}

/*
 * Ensure a full TLB invalidation has happened on @gt since @seqno was
 * sampled, performing one if necessary.
 *
 * Nothing is done when the selftest-only "no device" marker is set,
 * when the GT is wedged, or when the seqno has already been passed.
 * Otherwise the invalidation is attempted inside
 * with_intel_gt_pm_if_awake() (presumably skipped when the GT is not
 * awake -- confirm against the macro definition), serialised by
 * gt->tlb.invalidate_lock.
 */
void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno)
{
	intel_wakeref_t wakeref;

	if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
		return;

	/* Lockless fast path: bail if wedged or already invalidated. */
	if (intel_gt_is_wedged(gt) || tlb_seqno_passed(gt, seqno))
		return;

	with_intel_gt_pm_if_awake(gt, wakeref) {
		mutex_lock(&gt->tlb.invalidate_lock);

		/*
		 * Re-check now that we hold the lock: the seqno may have
		 * advanced while we were waiting to acquire it.
		 */
		if (!tlb_seqno_passed(gt, seqno)) {
			mmio_invalidate_full(gt);

			/* Publish the completed invalidation. */
			write_seqcount_invalidate(&gt->tlb.seqno);
		}

		mutex_unlock(&gt->tlb.invalidate_lock);
	}
}
+11 −1
Original line number Diff line number Diff line
@@ -101,6 +101,16 @@ void intel_gt_info_print(const struct intel_gt_info *info,

void intel_gt_watchdog_work(struct work_struct *work);

void intel_gt_invalidate_tlbs(struct intel_gt *gt);
/* Sample the current value of the GT's TLB invalidation seqcount. */
static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt)
{
	const u32 seq = seqprop_sequence(&gt->tlb.seqno);

	return seq;
}

/*
 * Return the current TLB seqno with its lowest bit set, i.e. an odd
 * (and therefore never zero) value derived from the present seqno.
 */
static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt)
{
	const u32 now = intel_gt_tlb_seqno(gt);

	return now | 1;
}

void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno);

#endif /* __INTEL_GT_H__ */
+17 −1
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include <linux/llist.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -88,7 +89,22 @@ struct intel_gt {
	struct intel_uc uc;
	struct intel_gsc gsc;

	struct mutex tlb_invalidate_lock;
	struct {
		/* Serialize global tlb invalidations */
		struct mutex invalidate_lock;

		/*
		 * Batch TLB invalidations
		 *
		 * After unbinding the PTE, we need to ensure the TLBs
		 * are invalidated prior to releasing the physical pages.
		 * But we only need one such invalidation for all unbinds,
		 * so we track how many TLB invalidations have been
		 * performed since unbinding the PTE and only emit an extra
		 * invalidate if no full barrier has been passed.
		 */
		seqcount_mutex_t seqno;
	} tlb;

	struct i915_wa_list wa_list;

Loading