Commit e25e92e0 authored by Rob Clark's avatar Rob Clark
Browse files

drm/msm: devcoredump iommu fault support



Wire up support to stall the SMMU on iova fault, and collect a devcore-
dump snapshot for easier debugging of faults.

Currently this is a6xx-only, but mostly only because so far it is the
only one using adreno-smmu-priv.

Signed-off-by: default avatarRob Clark <robdclark@chromium.org>
Acked-by: default avatarJordan Crouse <jordan@cosmicpenguin.net>
Link: https://lore.kernel.org/r/20210610214431.539029-6-robdclark@gmail.com


Signed-off-by: default avatarRob Clark <robdclark@chromium.org>
parent ba6014a4
Loading
Loading
Loading
Loading
+17 −2
Original line number Diff line number Diff line
@@ -1200,6 +1200,15 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
	struct drm_device *dev = gpu->dev;
	struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);

	/*
	 * If stalled on SMMU fault, we could trip the GPU's hang detection,
	 * but the fault handler will trigger the devcore dump, and we want
	 * to otherwise resume normally rather than killing the submit, so
	 * just bail.
	 */
	if (gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24))
		return;

	DRM_DEV_ERROR(dev->dev, "gpu fault ring %d fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
		ring ? ring->id : -1, ring ? ring->seqno : 0,
		gpu_read(gpu, REG_A5XX_RBBM_STATUS),
@@ -1523,6 +1532,7 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct msm_gpu *gpu)
{
	struct a5xx_gpu_state *a5xx_state = kzalloc(sizeof(*a5xx_state),
			GFP_KERNEL);
	bool stalled = !!(gpu_read(gpu, REG_A5XX_RBBM_STATUS3) & BIT(24));

	if (!a5xx_state)
		return ERR_PTR(-ENOMEM);
@@ -1535,7 +1545,12 @@ static struct msm_gpu_state *a5xx_gpu_state_get(struct msm_gpu *gpu)

	a5xx_state->base.rbbm_status = gpu_read(gpu, REG_A5XX_RBBM_STATUS);

	/* Get the HLSQ regs with the help of the crashdumper */
	/*
	 * Get the HLSQ regs with the help of the crashdumper, but only if
	 * we are not stalled in an iommu fault (in which case the crashdumper
	 * would not have access to memory)
	 */
	if (!stalled)
		a5xx_gpu_state_get_hlsq_regs(gpu, a5xx_state);

	a5xx_set_hwcg(gpu, true);
+36 −2
Original line number Diff line number Diff line
@@ -1193,6 +1193,16 @@ static int a6xx_fault_handler(void *arg, unsigned long iova, int flags, void *da
	struct msm_gpu *gpu = arg;
	struct adreno_smmu_fault_info *info = data;
	const char *type = "UNKNOWN";
	const char *block;
	bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);

	/*
	 * If we aren't going to be resuming later from fault_worker, then do
	 * it now.
	 */
	if (!do_devcoredump) {
		gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu);
	}

	/*
	 * Print a default message if we couldn't get the data from the
@@ -1216,15 +1226,30 @@ static int a6xx_fault_handler(void *arg, unsigned long iova, int flags, void *da
	else if (info->fsr & ARM_SMMU_FSR_EF)
		type = "EXTERNAL";

	block = a6xx_fault_block(gpu, info->fsynr1 & 0xff);

	pr_warn_ratelimited("*** gpu fault: ttbr0=%.16llx iova=%.16lx dir=%s type=%s source=%s (%u,%u,%u,%u)\n",
			info->ttbr0, iova,
			flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ", type,
			a6xx_fault_block(gpu, info->fsynr1 & 0xff),
			flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ",
			type, block,
			gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(4)),
			gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(5)),
			gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(6)),
			gpu_read(gpu, REG_A6XX_CP_SCRATCH_REG(7)));

	if (do_devcoredump) {
		/* Turn off the hangcheck timer to keep it from bothering us */
		del_timer(&gpu->hangcheck_timer);

		gpu->fault_info.ttbr0 = info->ttbr0;
		gpu->fault_info.iova  = iova;
		gpu->fault_info.flags = flags;
		gpu->fault_info.type  = type;
		gpu->fault_info.block = block;

		kthread_queue_work(gpu->worker, &gpu->fault_work);
	}

	return 0;
}

@@ -1276,6 +1301,15 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);

	/*
	 * If stalled on SMMU fault, we could trip the GPU's hang detection,
	 * but the fault handler will trigger the devcore dump, and we want
	 * to otherwise resume normally rather than killing the submit, so
	 * just bail.
	 */
	if (gpu_read(gpu, REG_A6XX_RBBM_STATUS3) & A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT)
		return;

	/*
	 * Force the GPU to stay on until after we finish
	 * collecting information
+34 −8
Original line number Diff line number Diff line
@@ -832,6 +832,20 @@ static void a6xx_get_registers(struct msm_gpu *gpu,
		a6xx_get_ahb_gpu_registers(gpu,
				a6xx_state, &a6xx_vbif_reglist,
				&a6xx_state->registers[index++]);
	if (!dumper) {
		/*
		 * We can't use the crashdumper when the SMMU is stalled,
		 * because the GPU has no memory access until we resume
		 * translation (but we don't want to do that until after
		 * we have captured as much useful GPU state as possible).
		 * So instead collect registers via the CPU:
		 */
		for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
			a6xx_get_ahb_gpu_registers(gpu,
				a6xx_state, &a6xx_reglist[i],
				&a6xx_state->registers[index++]);
		return;
	}

	for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
		a6xx_get_crashdumper_registers(gpu,
@@ -905,11 +919,13 @@ static void a6xx_get_indexed_registers(struct msm_gpu *gpu,

struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
{
	struct a6xx_crashdumper dumper = { 0 };
	struct a6xx_crashdumper _dumper = { 0 }, *dumper = NULL;
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
	struct a6xx_gpu_state *a6xx_state = kzalloc(sizeof(*a6xx_state),
		GFP_KERNEL);
	bool stalled = !!(gpu_read(gpu, REG_A6XX_RBBM_STATUS3) &
			A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT);

	if (!a6xx_state)
		return ERR_PTR(-ENOMEM);
@@ -928,14 +944,24 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
	/* Get the banks of indexed registers */
	a6xx_get_indexed_registers(gpu, a6xx_state);

	/* Try to initialize the crashdumper */
	if (!a6xx_crashdumper_init(gpu, &dumper)) {
		a6xx_get_registers(gpu, a6xx_state, &dumper);
		a6xx_get_shaders(gpu, a6xx_state, &dumper);
		a6xx_get_clusters(gpu, a6xx_state, &dumper);
		a6xx_get_dbgahb_clusters(gpu, a6xx_state, &dumper);
	/*
	 * Try to initialize the crashdumper, if we are not dumping state
	 * with the SMMU stalled.  The crashdumper needs memory access to
	 * write out GPU state, so we need to skip this when the SMMU is
	 * stalled in response to an iova fault
	 */
	if (!stalled && !a6xx_crashdumper_init(gpu, &_dumper)) {
		dumper = &_dumper;
	}

	a6xx_get_registers(gpu, a6xx_state, dumper);

	if (dumper) {
		a6xx_get_shaders(gpu, a6xx_state, dumper);
		a6xx_get_clusters(gpu, a6xx_state, dumper);
		a6xx_get_dbgahb_clusters(gpu, a6xx_state, dumper);

		msm_gem_kernel_put(dumper.bo, gpu->aspace, true);
		msm_gem_kernel_put(dumper->bo, gpu->aspace, true);
	}

	if (snapshot_debugbus)
+15 −0
Original line number Diff line number Diff line
@@ -684,6 +684,21 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
			adreno_gpu->info->revn, adreno_gpu->rev.core,
			adreno_gpu->rev.major, adreno_gpu->rev.minor,
			adreno_gpu->rev.patchid);
	/*
	 * If this is state collected due to iova fault, so fault related info
	 *
	 * TTBR0 would not be zero, so this is a good way to distinguish
	 */
	if (state->fault_info.ttbr0) {
		const struct msm_gpu_fault_info *info = &state->fault_info;

		drm_puts(p, "fault-info:\n");
		drm_printf(p, "  - ttbr0=%.16llx\n", info->ttbr0);
		drm_printf(p, "  - iova=%.16lx\n", info->iova);
		drm_printf(p, "  - dir=%s\n", info->flags & IOMMU_FAULT_WRITE ? "WRITE" : "READ");
		drm_printf(p, "  - type=%s\n", info->type);
		drm_printf(p, "  - source=%s\n", info->block);
	}

	drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);

+1 −0
Original line number Diff line number Diff line
@@ -328,6 +328,7 @@ struct msm_gem_submit {
	struct dma_fence *fence;
	struct msm_gpu_submitqueue *queue;
	struct pid *pid;    /* submitting process */
	bool fault_dumped;  /* Limit devcoredump dumping to one per submit */
	bool valid;         /* true if no cmdstream patching needed */
	bool in_rb;         /* "sudo" mode, copy cmds into RB */
	struct msm_ringbuffer *ring;
Loading