Commit a6f0f9cf authored by Alan Previn's avatar Alan Previn Committed by Lucas De Marchi
Browse files

drm/i915/guc: Plumb GuC-capture into gpu_coredump



Add a flags parameter through all of the coredump creation
functions. Add a bitmask flag to indicate if the top
level gpu_coredump event is triggered in response to
a GuC context reset notification.

Using that flag, ensure all coredump functions that
read or print mmio-register values related to work submission
or command-streamer engines are skipped and replaced with
calls to the equivalent guc-capture module functions to retrieve
or print the register dump.

While here, split out display related register reading
and printing into its own function that is called agnostic
to whether GuC had triggered the reset.

For now, introduce an empty printing function that can
be filled in by a subsequent patch just to handle formatting.

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220321164527.2500062-13-alan.previn.teres.alexis@intel.com
parent 247f8071
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -2229,11 +2229,11 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
	if (!cap->error)
		goto err_cap;

	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp, CORE_DUMP_FLAG_NONE);
	if (!cap->error->gt)
		goto err_gpu;

	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp, CORE_DUMP_FLAG_NONE);
	if (!cap->error->gt->engine)
		goto err_gt;

+1 −1
Original line number Diff line number Diff line
@@ -1318,7 +1318,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt, engine_mask);
		i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

+70 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include "gt/intel_engine_regs.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_regs.h"
#include "gt/intel_lrc.h"
#include "guc_capture_fwif.h"
#include "intel_guc_capture.h"
#include "intel_guc_fwif.h"
@@ -755,6 +756,18 @@ intel_guc_capture_output_min_size_est(struct intel_guc *guc)
 *                   data from GuC and then it's added into guc->capture->outlist linked
 *                   list. This list is used for matchup and printout by i915_gpu_coredump
 *                   and err_print_gt, (when user invokes the error capture sysfs).
 *
 * GUC --> notify context reset:
 * -----------------------------
 *     --> G2H CONTEXT RESET
 *                   L--> guc_handle_context_reset --> i915_capture_error_state
 *                          L--> i915_gpu_coredump(..IS_GUC_CAPTURE) --> gt_record_engines
 *                               --> capture_engine(..IS_GUC_CAPTURE)
 *                               L--> intel_guc_capture_get_matching_node is where we
 *                                    detach C from the internal linked list and add it
 *                                    into the intel_engine_coredump struct (if the
 *                                    context and engine of the event notification
 *                                    match a node in the linked list).
 */

static int guc_capture_buf_cnt(struct __guc_capture_bufstate *buf)
@@ -1370,6 +1383,63 @@ static void __guc_capture_process_output(struct intel_guc *guc)
	__guc_capture_flushlog_complete(guc);
}

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

/*
 * Placeholder for printing the GuC capture node attached to an engine
 * coredump. The body is intentionally empty for now: nothing is written
 * to @ebuf, @ee is not inspected, and 0 is always returned.
 */
int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *ebuf,
					const struct intel_engine_coredump *ee)
{
	return 0;
}

#endif //CONFIG_DRM_I915_CAPTURE_ERROR

/*
 * Release the GuC capture node held by an engine coredump.
 *
 * When @ee is non-NULL and carries a node, the node is passed to
 * guc_capture_add_node_to_cachelist() together with the capture handle
 * stored in @ee, and both references in @ee are then cleared. Otherwise
 * this is a no-op.
 */
void intel_guc_capture_free_node(struct intel_engine_coredump *ee)
{
	if (ee && ee->guc_capture_node) {
		guc_capture_add_node_to_cachelist(ee->capture, ee->guc_capture_node);
		ee->guc_capture_node = NULL;
		ee->capture = NULL;
	}
}

/*
 * Attach the GuC-reported capture node matching @ce on @ee's engine to @ee.
 *
 * Walks guc->capture->outlist looking for a node whose engine instance,
 * engine class, guc-id and lrca (masked with CTX_GTT_ADDRESS_MASK) all
 * agree with @ee->engine and @ce. The first such node is unlinked from the
 * list and stored in @ee along with the capture handle. If nothing matches,
 * a debug message is logged and @ee is left untouched.
 *
 * Returns silently when any argument is NULL or the GuC has no capture
 * state.
 */
void intel_guc_capture_get_matching_node(struct intel_gt *gt,
					 struct intel_engine_coredump *ee,
					 struct intel_context *ce)
{
	struct __guc_capture_parsed_output *node, *next;
	struct intel_guc *guc;

	if (!gt || !ee || !ce)
		return;

	guc = &gt->uc.guc;
	if (!guc->capture)
		return;

	/* The coredump must not already own a capture node. */
	GEM_BUG_ON(ee->guc_capture_node);

	list_for_each_entry_safe(node, next, &guc->capture->outlist, link) {
		/* Engine identification: instance first, then class. */
		if (node->eng_inst != GUC_ID_TO_ENGINE_INSTANCE(ee->engine->guc_id))
			continue;
		if (node->eng_class != GUC_ID_TO_ENGINE_CLASS(ee->engine->guc_id))
			continue;

		/* A node with a zero guc-id is never considered a match. */
		if (!node->guc_id || node->guc_id != ce->guc_id.id)
			continue;

		/* Likewise a node whose masked lrca is zero is rejected. */
		if (!(node->lrca & CTX_GTT_ADDRESS_MASK))
			continue;
		if ((node->lrca & CTX_GTT_ADDRESS_MASK) !=
		    (ce->lrc.lrca & CTX_GTT_ADDRESS_MASK))
			continue;

		/* Match: move the node from the output list to the coredump. */
		list_del(&node->link);
		ee->guc_capture_node = node;
		ee->capture = guc->capture;
		return;
	}

	drm_dbg(&gt->i915->drm, "GuC capture can't match ee to node\n");
}

void intel_guc_capture_process(struct intel_guc *guc)
{
	if (guc->capture)
+9 −0
Original line number Diff line number Diff line
@@ -8,9 +8,18 @@

#include <linux/types.h>

struct drm_i915_error_state_buf;
struct guc_gt_system_info;
struct intel_context;
struct intel_engine_coredump;
struct intel_gt;
struct intel_guc;

/*
 * Give back the GuC capture node held by @ee (if any) and clear @ee's
 * references to it; no-op when @ee is NULL or holds no node.
 */
void intel_guc_capture_free_node(struct intel_engine_coredump *ee);
/* Print the GuC capture node of @ee into @m; returns an int status code. */
int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *m,
					const struct intel_engine_coredump *ee);
/*
 * Search the GuC capture output list of @gt for the node matching @ce on
 * @ee's engine and, when found, detach it from the list and attach it to @ee.
 */
void intel_guc_capture_get_matching_node(struct intel_gt *gt, struct intel_engine_coredump *ee,
					 struct intel_context *ce);
void intel_guc_capture_process(struct intel_guc *guc);
int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
+1 −1
Original line number Diff line number Diff line
@@ -4031,7 +4031,7 @@ static void capture_error_state(struct intel_guc *guc,

	intel_engine_set_hung_context(engine, ce);
	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		i915_capture_error_state(gt, engine->mask);
		i915_capture_error_state(gt, engine->mask, CORE_DUMP_FLAG_IS_GUC_CAPTURE);
	atomic_inc(&i915->gpu_error.reset_engine_count[engine->uabi_class]);
}

Loading