Commit eb619fdb authored by Jens Axboe
Browse files

blk-mq: fix issue with shared tag queue re-running



This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:

modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job such as:

[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4

[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3

will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.

Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent e454d122
Loading
Loading
Loading
Loading
+0 −1
Original line number Original line Diff line number Diff line
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
	HCTX_STATE_NAME(STOPPED),
	HCTX_STATE_NAME(STOPPED),
	HCTX_STATE_NAME(TAG_ACTIVE),
	HCTX_STATE_NAME(TAG_ACTIVE),
	HCTX_STATE_NAME(SCHED_RESTART),
	HCTX_STATE_NAME(SCHED_RESTART),
	HCTX_STATE_NAME(TAG_WAITING),
	HCTX_STATE_NAME(START_ON_RUN),
	HCTX_STATE_NAME(START_ON_RUN),
};
};
#undef HCTX_STATE_NAME
#undef HCTX_STATE_NAME
+48 −37
Original line number Original line Diff line number Diff line
@@ -998,41 +998,55 @@ done:
	return rq->tag != -1;
	return rq->tag != -1;
}
}


static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				void *key)
				int flags, void *key)
{
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_hw_ctx *hctx;


	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);


	list_del(&wait->entry);
	list_del_init(&wait->entry);
	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
	blk_mq_run_hw_queue(hctx, true);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
	return 1;
}
}


static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
				     struct request *rq)
{
{
	struct blk_mq_hw_ctx *this_hctx = *hctx;
	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
	struct sbq_wait_state *ws;
	struct sbq_wait_state *ws;


	if (!list_empty_careful(&wait->entry))
		return false;

	spin_lock(&this_hctx->lock);
	if (!list_empty(&wait->entry)) {
		spin_unlock(&this_hctx->lock);
		return false;
	}

	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
	add_wait_queue(&ws->wait, wait);

	/*
	/*
	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
	 * It's possible that a tag was freed in the window between the
	 * The thread which wins the race to grab this bit adds the hardware
	 * allocation failure and adding the hardware queue to the wait
	 * queue to the wait queue.
	 * queue.
	 */
	 */
	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
		spin_unlock(&this_hctx->lock);
		return false;
		return false;

	}
	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);


	/*
	/*
	 * As soon as this returns, it's no longer safe to fiddle with
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * hctx->dispatch_wait, since a completion can wake up the wait queue
	 * someone else gets the wakeup.
	 * and unlock the bit.
	 */
	 */
	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
	spin_lock_irq(&ws->wait.lock);
	list_del_init(&wait->entry);
	spin_unlock_irq(&ws->wait.lock);
	spin_unlock(&this_hctx->lock);
	return true;
	return true;
}
}


@@ -1041,6 +1055,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
{
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_hw_ctx *hctx;
	struct request *rq, *nxt;
	struct request *rq, *nxt;
	bool no_tag = false;
	int errors, queued;
	int errors, queued;


	if (list_empty(list))
	if (list_empty(list))
@@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
			/*
			/*
			 * The initial allocation attempt failed, so we need to
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed.
			 * rerun the hardware queue when a tag is freed. The
			 * waitqueue takes care of that. If the queue is run
			 * before we add this entry back on the dispatch list,
			 * we'll re-run it below.
			 */
			 */
			if (!blk_mq_dispatch_wait_add(hctx)) {
			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
				if (got_budget)
					blk_mq_put_dispatch_budget(hctx);
				break;
			}

			/*
			 * It's possible that a tag was freed in the window
			 * between the allocation failure and adding the
			 * hardware queue to the wait queue.
			 */
			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
				if (got_budget)
				if (got_budget)
					blk_mq_put_dispatch_budget(hctx);
					blk_mq_put_dispatch_budget(hctx);
				no_tag = true;
				break;
				break;
			}
			}
		}
		}
@@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		 * it is no longer set that means that it was cleared by another
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 * thread and hence that a queue rerun is needed.
		 *
		 *
		 * If TAG_WAITING is set that means that an I/O scheduler has
		 * If 'no_tag' is set, that means that we failed getting
		 * been configured and another thread is waiting for a driver
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * tag. To guarantee fairness, do not rerun this hardware queue
		 * waitqueue is no longer active, ensure that we run the queue
		 * but let the other thread grab the driver tag.
		 * AFTER adding our entries back to the list.
		 *
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * the hardware queue got stopped and restarted before requests
@@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   and dm-rq.
		 *   and dm-rq.
		 */
		 */
		if (!blk_mq_sched_needs_restart(hctx) &&
		if (!blk_mq_sched_needs_restart(hctx) ||
		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
			blk_mq_run_hw_queue(hctx, true);
			blk_mq_run_hw_queue(hctx, true);
	}
	}


@@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q,


	hctx->nr_ctx = 0;
	hctx->nr_ctx = 0;


	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

	if (set->ops->init_hctx &&
	if (set->ops->init_hctx &&
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
		goto free_bitmap;
		goto free_bitmap;
+2 −3
Original line number Original line Diff line number Diff line
@@ -181,8 +181,7 @@ enum {
	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,
	BLK_MQ_S_SCHED_RESTART	= 2,
	BLK_MQ_S_TAG_WAITING	= 3,
	BLK_MQ_S_START_ON_RUN	= 3,
	BLK_MQ_S_START_ON_RUN	= 4,


	BLK_MQ_MAX_DEPTH	= 10240,
	BLK_MQ_MAX_DEPTH	= 10240,