Commit 704b914f authored by Ming Lei's avatar Ming Lei Committed by Jens Axboe
Browse files

blk-mq: move srcu from blk_mq_hw_ctx to request_queue



In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch
critical area. However, this srcu instance stays at the end of hctx, and
it often takes standalone cacheline, often cold.

Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on
the indirect percpu variable which is allocated from heap instead of
being embedded, srcu->srcu_idx is read only in srcu_read_lock(). It
doesn't matter if srcu structure stays in hctx or request queue.

So switch to per-request-queue srcu for protecting dispatch, and this
way simplifies quiesce a lot, not mention quiesce is always done on the
request queue wide.

Signed-off-by: default avatarMing Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 2a904d00
Loading
Loading
Loading
Loading
+22 −5
Original line number Diff line number Diff line
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;
struct kmem_cache *blk_requestq_srcu_cachep;

/*
 * Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
{
}

struct request_queue *blk_alloc_queue(int node_id)
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
	struct request_queue *q;
	int ret;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
			GFP_KERNEL | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	if (alloc_srcu) {
		blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
		if (init_srcu_struct(q->srcu) != 0)
			goto fail_q;
	}

	q->last_merge = NULL;

	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
	if (q->id < 0)
		goto fail_q;
		goto fail_srcu;

	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
	if (ret)
@@ -508,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id)
	bioset_exit(&q->bio_split);
fail_id:
	ida_simple_remove(&blk_queue_ida, q->id);
fail_srcu:
	if (alloc_srcu)
		cleanup_srcu_struct(q->srcu);
fail_q:
	kmem_cache_free(blk_requestq_cachep, q);
	kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
	return NULL;
}

@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
			sizeof_field(struct request, cmd_flags));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			sizeof_field(struct bio, bi_opf));
	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
			   __alignof__(struct request_queue)) !=
		     sizeof(struct request_queue));

	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
	kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
	blk_requestq_cachep = kmem_cache_create("request_queue",
			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
			sizeof(struct request_queue) +
			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);

	blk_debugfs_root = debugfs_create_dir("block", NULL);

	return 0;
+0 −2
Original line number Diff line number Diff line
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
						  kobj);

	if (hctx->flags & BLK_MQ_F_BLOCKING)
		cleanup_srcu_struct(hctx->srcu);
	blk_free_flush_queue(hctx->fq);
	sbitmap_free(&hctx->ctx_map);
	free_cpumask_var(hctx->cpumask);
+8 −29
Original line number Diff line number Diff line
@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 */
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(hctx->srcu);
	if (blk_queue_has_srcu(q))
		synchronize_srcu(q->srcu);
	else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
	}
}

static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
{
	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);

	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
			   __alignof__(struct blk_mq_hw_ctx)) !=
		     sizeof(struct blk_mq_hw_ctx));

	if (tag_set->flags & BLK_MQ_F_BLOCKING)
		hw_ctx_size += sizeof(struct srcu_struct);

	return hw_ctx_size;
}

static int blk_mq_init_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
	struct blk_mq_hw_ctx *hctx;
	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
	if (!hctx)
		goto fail_alloc_hctx;

@@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
	if (!hctx->fq)
		goto free_bitmap;

	if (hctx->flags & BLK_MQ_F_BLOCKING)
		init_srcu_struct(hctx->srcu);
	blk_mq_hctx_kobj_init(hctx);

	return hctx;
@@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
	struct request_queue *q;
	int ret;

	q = blk_alloc_queue(set->numa_node);
	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
	if (!q)
		return ERR_PTR(-ENOMEM);
	q->queuedata = queuedata;
@@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q)
{
	WARN_ON_ONCE(blk_queue_has_srcu(q) !=
			!!(set->flags & BLK_MQ_F_BLOCKING));

	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

+2 −2
Original line number Diff line number Diff line
@@ -385,9 +385,9 @@ do { \
		int srcu_idx;					\
								\
		might_sleep();					\
		srcu_idx = srcu_read_lock((hctx)->srcu);	\
		srcu_idx = srcu_read_lock((hctx)->queue->srcu);	\
		(dispatch_ops);					\
		srcu_read_unlock((hctx)->srcu, srcu_idx);	\
		srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
	}							\
} while (0)

+2 −1
Original line number Diff line number Diff line
@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
	struct request_queue *q = container_of(rcu_head, struct request_queue,
					       rcu_head);
	kmem_cache_free(blk_requestq_cachep, q);

	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}

/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
Loading