Commit d73b1d02 authored by Rob Clark's avatar Rob Clark
Browse files

drm/msm: Hangcheck progress detection



If the hangcheck timer expires, check if the fw's position in the
cmdstream has advanced (changed) since last timer expiration, and
allow it up to three additional "extensions" to it's alotted time.
The intention is to continue to catch "shader stuck in a loop" type
hangs quickly, but allow more time for things that are actually
making forward progress.

Because we need to sample the CP state twice to detect if there has
not been progress, this also cuts the the timer's duration in half.

v2: Fix typo (REG_A6XX_CP_CSQ_IB2_STAT), add comment
v3: Only halve hangcheck timer duration for generations which
    support progress detection (hdanton); removed unused a5xx
    progress (without knowing how to adjust for data buffered
    in ROQ it is too likely to report a false negative)
v4: Comment updates to better describe the total hangcheck
    duration when progress detection is applied

Reviewed-by: default avatarChia-I Wu <olvaffe@gmail.com>
Tested-by: Chia-I Wu <olvaffe@gmail.com> # dEQP-GLES2.functional.flush_finish.wait
Signed-off-by: default avatarRob Clark <robdclark@chromium.org>
Reviewed-by: default avatarAkhil P Oommen <quic_akhilpo@quicinc.com>
Patchwork: https://patchwork.freedesktop.org/patch/511584/
Link: https://lore.kernel.org/r/20221114193049.1533391-3-robdclark@gmail.com
parent cade05b2
Loading
Loading
Loading
Loading
+34 −0
Original line number Diff line number Diff line
@@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
	return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
}

static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
{
	struct msm_cp_state cp_state = {
		.ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
		.ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
		.ib1_rem  = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
		.ib2_rem  = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
	};
	bool progress;

	/*
	 * Adjust the remaining data to account for what has already been
	 * fetched from memory, but not yet consumed by the SQE.
	 *
	 * This is not *technically* correct, the amount buffered could
	 * exceed the IB size due to hw prefetching ahead, but:
	 *
	 * (1) We aren't trying to find the exact position, just whether
	 *     progress has been made
	 * (2) The CP_REG_TO_MEM at the end of a submit should be enough
	 *     to prevent prefetching into an unrelated submit.  (And
	 *     either way, at some point the ROQ will be full.)
	 */
	cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
	cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;

	progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));

	ring->last_cp_state = cp_state;

	return progress;
}

static u32 a618_get_speed_bin(u32 fuse)
{
	if (fuse == 0)
@@ -1959,6 +1992,7 @@ static const struct adreno_gpu_funcs funcs = {
		.create_address_space = a6xx_create_address_space,
		.create_private_address_space = a6xx_create_private_address_space,
		.get_rptr = a6xx_get_rptr,
		.progress = a6xx_progress,
	},
	.get_timestamp = a6xx_get_timestamp,
};
+0 −1
Original line number Diff line number Diff line
@@ -419,7 +419,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
	priv->dev = ddev;

	priv->wq = alloc_ordered_workqueue("msm", 0);
	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;

	INIT_LIST_HEAD(&priv->objects);
	mutex_init(&priv->obj_lock);
+7 −1
Original line number Diff line number Diff line
@@ -224,7 +224,13 @@ struct msm_drm_private {

	struct drm_atomic_state *pm_state;

	/* For hang detection, in ms */
	/**
	 * hangcheck_period: For hang detection, in ms
	 *
	 * Note that in practice, a submit/job will get at least two hangcheck
	 * periods, due to checking for progress being implemented as simply
	 * "have the CP position registers changed since last time?"
	 */
	unsigned int hangcheck_period;

	/**
+30 −1
Original line number Diff line number Diff line
@@ -492,6 +492,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
			round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
}

static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
{
	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
		return false;

	if (!gpu->funcs->progress)
		return false;

	if (!gpu->funcs->progress(gpu, ring))
		return false;

	ring->hangcheck_progress_retries++;
	return true;
}

static void hangcheck_handler(struct timer_list *t)
{
	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -502,9 +517,12 @@ static void hangcheck_handler(struct timer_list *t)
	if (fence != ring->hangcheck_fence) {
		/* some progress has been made.. ya! */
		ring->hangcheck_fence = fence;
	} else if (fence_before(fence, ring->fctx->last_fence)) {
		ring->hangcheck_progress_retries = 0;
	} else if (fence_before(fence, ring->fctx->last_fence) &&
			!made_progress(gpu, ring)) {
		/* no progress and not done.. hung! */
		ring->hangcheck_fence = fence;
		ring->hangcheck_progress_retries = 0;
		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
				gpu->name, ring->id);
		DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",
@@ -830,6 +848,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
		const char *name, struct msm_gpu_config *config)
{
	struct msm_drm_private *priv = drm->dev_private;
	int i, ret, nr_rings = config->nr_rings;
	void *memptrs;
	uint64_t memptrs_iova;
@@ -857,6 +876,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
	kthread_init_work(&gpu->recover_work, recover_worker);
	kthread_init_work(&gpu->fault_work, fault_worker);

	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;

	/*
	 * If progress detection is supported, halve the hangcheck timer
	 * duration, as it takes two iterations of the hangcheck handler
	 * to detect a hang.
	 */
	if (funcs->progress)
		priv->hangcheck_period /= 2;

	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);

	spin_lock_init(&gpu->perf_lock);
+10 −0
Original line number Diff line number Diff line
@@ -78,6 +78,15 @@ struct msm_gpu_funcs {
	struct msm_gem_address_space *(*create_private_address_space)
		(struct msm_gpu *gpu);
	uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);

	/**
	 * progress: Has the GPU made progress?
	 *
	 * Return true if GPU position in cmdstream has advanced (or changed)
	 * since the last call.  To avoid false negatives, this should account
	 * for cmdstream that is buffered in this FIFO upstream of the CP fw.
	 */
	bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
};

/* Additional state for iommu faults: */
@@ -237,6 +246,7 @@ struct msm_gpu {
#define DRM_MSM_INACTIVE_PERIOD   66 /* in ms (roughly four frames) */

#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
	struct timer_list hangcheck_timer;

	/* Fault info for most recent iova fault: */
Loading