Commit 9d5526da authored by Dusica Milinkovic's avatar Dusica Milinkovic Committed by Jialin Zhang
Browse files

drm/amdgpu: Increase tlb flush timeout for sriov

stable inclusion
from stable-v5.10.141
commit afa169f79d479edb23f1a727a37265a25d5a1e8e
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I685FC

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=afa169f79d479edb23f1a727a37265a25d5a1e8e



--------------------------------

[ Upstream commit 373008bf ]

[Why]
During multi-vf executing benchmark (Luxmark) observed kiq error timeout.
It happenes because all of VFs do the tlb invalidation at the same time.
Although each VF has the invalidate register set, from hardware side
the invalidate requests are queue to execute.

[How]
In case of 12 VF increase timeout on 12*100ms

Signed-off-by: default avatarDusica Milinkovic <Dusica.Milinkovic@amd.com>
Acked-by: default avatarShaoyun Liu <shaoyun.liu@amd.com>
Acked-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
Signed-off-by: default avatarJialin Zhang <zhangjialin11@huawei.com>
Reviewed-by: default avatarZheng Zengkai <zhengzengkai@huawei.com>
parent 162ee5e0
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -283,7 +283,7 @@ enum amdgpu_kiq_irq {
	AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
	AMDGPU_CP_KIQ_IRQ_LAST
};

#define SRIOV_USEC_TIMEOUT  1200000 /* wait 12 * 100ms for SRIOV */
#define MAX_KIQ_REG_WAIT       5000 /* in usecs, 5ms */
#define MAX_KIQ_REG_BAILOUT_INTERVAL   5 /* in msecs, 5ms */
#define MAX_KIQ_REG_TRY 80 /* 20 -> 80 */
+2 −1
Original line number Diff line number Diff line
@@ -371,6 +371,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
	uint32_t seq;
	uint16_t queried_pasid;
	bool ret;
	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT : adev->usec_timeout;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

@@ -389,7 +390,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,

		amdgpu_ring_commit(ring);
		spin_unlock(&adev->gfx.kiq.ring_lock);
		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
		r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
		if (r < 1) {
			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
			return -ETIME;
+2 −1
Original line number Diff line number Diff line
@@ -839,6 +839,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
	uint32_t seq;
	uint16_t queried_pasid;
	bool ret;
	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT : adev->usec_timeout;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

@@ -878,7 +879,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,

		amdgpu_ring_commit(ring);
		spin_unlock(&adev->gfx.kiq.ring_lock);
		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
		r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
		if (r < 1) {
			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
			up_read(&adev->reset_sem);