drm/msm: Hangcheck progress detection (d73b1d02) · Commits · EulixOS / Software / Kernel

drivers/gpu/drm/msm/adreno/a6xx_gpu.c

+34 −0

Original line number	Diff line number	Diff line
		@@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu gpu, struct msm_ringbuffer ring)
		return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
		}

		static bool a6xx_progress(struct msm_gpu gpu, struct msm_ringbuffer ring)
		{
		struct msm_cp_state cp_state = {
		.ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
		.ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
		.ib1_rem = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
		.ib2_rem = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
		};
		bool progress;

		/*
		* Adjust the remaining data to account for what has already been
		* fetched from memory, but not yet consumed by the SQE.
		*
		* This is not technically correct, the amount buffered could
		* exceed the IB size due to hw prefetching ahead, but:
		*
		* (1) We aren't trying to find the exact position, just whether
		* progress has been made
		* (2) The CP_REG_TO_MEM at the end of a submit should be enough
		* to prevent prefetching into an unrelated submit. (And
		* either way, at some point the ROQ will be full.)
		*/
		cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
		cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;

		progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));

		ring->last_cp_state = cp_state;

		return progress;
		}

		static u32 a618_get_speed_bin(u32 fuse)
		{
		if (fuse == 0)
		@@ -1959,6 +1992,7 @@ static const struct adreno_gpu_funcs funcs = {
		.create_address_space = a6xx_create_address_space,
		.create_private_address_space = a6xx_create_private_address_space,
		.get_rptr = a6xx_get_rptr,
		.progress = a6xx_progress,
		},
		.get_timestamp = a6xx_get_timestamp,
		};

drivers/gpu/drm/msm/msm_drv.c

+0 −1

Original line number	Diff line number	Diff line
		@@ -419,7 +419,6 @@ static int msm_drm_init(struct device dev, const struct drm_driver drv)
		priv->dev = ddev;

		priv->wq = alloc_ordered_workqueue("msm", 0);
		priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;

		INIT_LIST_HEAD(&priv->objects);
		mutex_init(&priv->obj_lock);

drivers/gpu/drm/msm/msm_drv.h

+7 −1

Original line number	Diff line number	Diff line
		@@ -224,7 +224,13 @@ struct msm_drm_private {

		struct drm_atomic_state *pm_state;

		/* For hang detection, in ms */
		/**
		* hangcheck_period: For hang detection, in ms
		*
		* Note that in practice, a submit/job will get at least two hangcheck
		* periods, due to checking for progress being implemented as simply
		* "have the CP position registers changed since last time?"
		*/
		unsigned int hangcheck_period;

		/**

drivers/gpu/drm/msm/msm_gpu.c

+30 −1

Original line number	Diff line number	Diff line
		@@ -492,6 +492,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
		round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
		}

		static bool made_progress(struct msm_gpu gpu, struct msm_ringbuffer ring)
		{
		if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
		return false;

		if (!gpu->funcs->progress)
		return false;

		if (!gpu->funcs->progress(gpu, ring))
		return false;

		ring->hangcheck_progress_retries++;
		return true;
		}

		static void hangcheck_handler(struct timer_list *t)
		{
		struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
		@@ -502,9 +517,12 @@ static void hangcheck_handler(struct timer_list *t)
		if (fence != ring->hangcheck_fence) {
		/* some progress has been made.. ya! */
		ring->hangcheck_fence = fence;
		} else if (fence_before(fence, ring->fctx->last_fence)) {
		ring->hangcheck_progress_retries = 0;
		} else if (fence_before(fence, ring->fctx->last_fence) &&
		!made_progress(gpu, ring)) {
		/* no progress and not done.. hung! */
		ring->hangcheck_fence = fence;
		ring->hangcheck_progress_retries = 0;
		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
		gpu->name, ring->id);
		DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n",
		@@ -830,6 +848,7 @@ int msm_gpu_init(struct drm_device drm, struct platform_device pdev,
		struct msm_gpu gpu, const struct msm_gpu_funcs funcs,
		const char name, struct msm_gpu_config config)
		{
		struct msm_drm_private *priv = drm->dev_private;
		int i, ret, nr_rings = config->nr_rings;
		void *memptrs;
		uint64_t memptrs_iova;
		@@ -857,6 +876,16 @@ int msm_gpu_init(struct drm_device drm, struct platform_device pdev,
		kthread_init_work(&gpu->recover_work, recover_worker);
		kthread_init_work(&gpu->fault_work, fault_worker);

		priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;

		/*
		* If progress detection is supported, halve the hangcheck timer
		* duration, as it takes two iterations of the hangcheck handler
		* to detect a hang.
		*/
		if (funcs->progress)
		priv->hangcheck_period /= 2;

		timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);

		spin_lock_init(&gpu->perf_lock);

drivers/gpu/drm/msm/msm_gpu.h

+10 −0

Original line number	Diff line number	Diff line
		@@ -78,6 +78,15 @@ struct msm_gpu_funcs {
		struct msm_gem_address_space (create_private_address_space)
		(struct msm_gpu *gpu);
		uint32_t (get_rptr)(struct msm_gpu gpu, struct msm_ringbuffer *ring);

		/**
		* progress: Has the GPU made progress?
		*
		* Return true if GPU position in cmdstream has advanced (or changed)
		* since the last call. To avoid false negatives, this should account
		* for cmdstream that is buffered in this FIFO upstream of the CP fw.
		*/
		bool (progress)(struct msm_gpu gpu, struct msm_ringbuffer *ring);
		};

		/* Additional state for iommu faults: */
		@@ -237,6 +246,7 @@ struct msm_gpu {
		#define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */

		#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
		#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
		struct timer_list hangcheck_timer;

		/* Fault info for most recent iova fault: */