Commit c41166f9 authored by Chris Wilson's avatar Chris Wilson
Browse files

drm/i915: Beware temporary wedging when determining -EIO

At a few points in our uABI, we check to see if the driver is wedged and
report -EIO back to the user in that case. However, as we perform the
check and reset asynchronously (where once before they were both
serialised by the struct_mutex), we may instead see the temporary wedging
used to cancel inflight rendering to avoid a deadlock during reset
(caused by either us timing out in our reset handler,
i915_wedge_on_timeout or with malice aforethought in intel_reset_prepare
for a stuck modeset). If we suspect this is the case, that is we see a
wedged driver *and* reset in progress, then wait until the reset is
resolved before reporting upon the wedged status.

v2: might_sleep() (Mika)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580


Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: default avatarMika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk
parent 47ed55a9
Loading
Loading
Loading
Loading
+12 −5
Original line number Diff line number Diff line
@@ -3833,11 +3833,18 @@ static const struct file_operations i915_cur_wm_latency_fops = {
static int
i915_wedged_get(void *data, u64 *val)
{
	struct drm_i915_private *dev_priv = data;

	*val = i915_terminally_wedged(&dev_priv->gpu_error);
	int ret = i915_terminally_wedged(data);

	switch (ret) {
	case -EIO:
		*val = 1;
		return 0;
	case 0:
		*val = 0;
		return 0;
	default:
		return ret;
	}
}

static int
@@ -3918,7 +3925,7 @@ i915_drop_caches_set(void *data, u64 val)
		mutex_unlock(&i915->drm.struct_mutex);
	}

	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
		i915_handle_error(i915, ALL_ENGINES, 0, NULL);

	fs_reclaim_acquire(GFP_KERNEL);
+6 −1
Original line number Diff line number Diff line
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
struct i915_request *
i915_gem_find_active_request(struct intel_engine_cs *engine);

static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
static inline bool __i915_wedged(struct i915_gpu_error *error)
{
	return unlikely(test_bit(I915_WEDGED, &error->flags));
}

static inline bool i915_reset_failed(struct drm_i915_private *i915)
{
	return __i915_wedged(&i915->gpu_error);
}

static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
	return READ_ONCE(error->reset_count);
+13 −12
Original line number Diff line number Diff line
@@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
		 * fail). But any other -EIO isn't ours (e.g. swap in failure)
		 * and so needs to be reported.
		 */
		if (!i915_terminally_wedged(&dev_priv->gpu_error))
		if (!i915_terminally_wedged(dev_priv))
			return VM_FAULT_SIGBUS;
		/* else: fall through */
	case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	if (i915_terminally_wedged(&i915->gpu_error))
	if (i915_reset_failed(i915))
		return;

	GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
	long ret;

	/* ABI: return -EIO if already wedged */
	if (i915_terminally_wedged(&dev_priv->gpu_error))
		return -EIO;
	ret = i915_terminally_wedged(dev_priv);
	if (ret)
		return ret;

	spin_lock(&file_priv->mm.lock);
	list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
@@ -4460,7 +4461,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
	 * back to defaults, recovering from whatever wedged state we left it
	 * in and so worth trying to use the device once more.
	 */
	if (i915_terminally_wedged(&i915->gpu_error))
	if (i915_terminally_wedged(i915))
		i915_gem_unset_wedged(i915);

	/*
@@ -4504,7 +4505,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
	 * state. Fortunately, the kernel_context is disposable and we do
	 * not rely on its state.
	 */
	if (!i915_terminally_wedged(&i915->gpu_error)) {
	if (!i915_reset_failed(i915)) {
		ret = i915_gem_switch_to_kernel_context(i915);
		if (ret)
			goto err_unlock;
@@ -4625,8 +4626,9 @@ void i915_gem_resume(struct drm_i915_private *i915)
	return;

err_wedged:
	if (!i915_terminally_wedged(&i915->gpu_error)) {
		DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
	if (!i915_reset_failed(i915)) {
		dev_err(i915->drm.dev,
			"Failed to re-initialize GPU, declaring it wedged!\n");
		i915_gem_set_wedged(i915);
	}
	goto out_unlock;
@@ -4731,10 +4733,9 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
	init_unused_rings(dev_priv);

	BUG_ON(!dev_priv->kernel_context);
	if (i915_terminally_wedged(&dev_priv->gpu_error)) {
		ret = -EIO;
	ret = i915_terminally_wedged(dev_priv);
	if (ret)
		goto out;
	}

	ret = i915_ppgtt_init_hw(dev_priv);
	if (ret) {
@@ -5107,7 +5108,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
		 * wedged. But we only want to do this where the GPU is angry,
		 * for all other failure, such as an allocation failure, bail.
		 */
		if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
		if (!i915_reset_failed(dev_priv)) {
			i915_load_error(dev_priv,
					"Failed to initialize GPU, declaring it wedged!\n");
			i915_gem_set_wedged(dev_priv);
+3 −2
Original line number Diff line number Diff line
@@ -559,8 +559,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
	 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
	 * EIO if the GPU is already wedged.
	 */
	if (i915_terminally_wedged(&i915->gpu_error))
		return ERR_PTR(-EIO);
	ret = i915_terminally_wedged(i915);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Pinning the contexts may generate requests in order to acquire
+27 −2
Original line number Diff line number Diff line
@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,

finish:
	reset_finish(i915);
	if (!i915_terminally_wedged(error))
	if (!__i915_wedged(error))
		reset_restart(i915);
	return;

@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
	if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,31 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
	srcu_read_unlock(&error->reset_backoff_srcu, tag);
}

int i915_terminally_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;

	might_sleep();

	if (!__i915_wedged(error))
		return 0;

	/* Reset still in progress? Maybe we will recover? */
	if (!test_bit(I915_RESET_BACKOFF, &error->flags))
		return -EIO;

	/* XXX intel_reset_finish() still takes struct_mutex!!! */
	if (mutex_is_locked(&i915->drm.struct_mutex))
		return -EAGAIN;

	if (wait_event_interruptible(error->reset_queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &error->flags)))
		return -EINTR;

	return __i915_wedged(error) ? -EIO : 0;
}

bool i915_reset_flush(struct drm_i915_private *i915)
{
	int err;
Loading