!9084 v5 xfs: atomic writes for xfs (a1125812) · Commits · EulixOS / Software / Kernel

Documentation/ABI/testing/sysfs-block

+52 −0

Original line number	Diff line number	Diff line
		@@ -97,6 +97,58 @@ Description:
		indicates how many bytes the beginning of the device is
		offset from the disk's natural alignment.

		What: /sys/block/<disk>/atomic_write_max_bytes
		Date: February 2024
		Contact: Himanshu Madhani <himanshu.madhani@oracle.com>
		Description:
		[RO] This parameter specifies the maximum atomic write
		size reported by the device. This parameter is relevant
		for merging of writes, where a merged atomic write
		operation must not exceed this number of bytes.
		This parameter may be greater than the value in
		atomic_write_unit_max_bytes as
		atomic_write_unit_max_bytes will be rounded down to a
		power-of-two and atomic_write_unit_max_bytes may also be
		limited by some other queue limits, such as max_segments.
		This parameter - along with atomic_write_unit_min_bytes
		and atomic_write_unit_max_bytes - will not be larger than
		max_hw_sectors_kb, but may be larger than max_sectors_kb.


		What: /sys/block/<disk>/atomic_write_unit_min_bytes
		Date: February 2024
		Contact: Himanshu Madhani <himanshu.madhani@oracle.com>
		Description:
		[RO] This parameter specifies the smallest block which can
		be written atomically with an atomic write operation. All
		atomic write operations must begin at an
		atomic_write_unit_min boundary and must be multiples of
		atomic_write_unit_min. This value must be a power-of-two.


		What: /sys/block/<disk>/atomic_write_unit_max_bytes
		Date: February 2024
		Contact: Himanshu Madhani <himanshu.madhani@oracle.com>
		Description:
		[RO] This parameter defines the largest block which can be
		written atomically with an atomic write operation. This
		value must be a multiple of atomic_write_unit_min and must
		be a power-of-two. This value will not be larger than
		atomic_write_max_bytes.


		What: /sys/block/<disk>/atomic_write_boundary_bytes
		Date: February 2024
		Contact: Himanshu Madhani <himanshu.madhani@oracle.com>
		Description:
		[RO] A device may need to internally split I/Os which
		straddle a given logical block address boundary. In that
		case a single atomic write operation will be processed as
		one or more sub-operations which each complete atomically.
		This parameter specifies the size in bytes of the atomic
		boundary if one is reported by the device. This value must
		be a power-of-two.

		What: /sys/block/<disk>/<partition>/alignment_offset
		Date: April 2009
		Contact: Martin K. Petersen <martin.petersen@oracle.com>

block/blk-core.c

+37 −2

Original line number	Diff line number	Diff line
		@@ -81,6 +81,7 @@ __setup("precise_iostat=", precise_iostat_setup);
		* For queue allocation
		*/
		struct kmem_cache *blk_requestq_cachep;
		struct kmem_cache *queue_atomic_write_cachep;

		/*
		* Controlling structure to kblockd
		@@ -433,6 +434,8 @@ static const struct {
		[BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
		[BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },

		[BLK_STS_INVAL] = { -EINVAL, "invalid" },

		/* everything else not covered above: */
		[BLK_STS_IOERR] = { -EIO, "I/O" },
		};
		@@ -758,6 +761,7 @@ static void blk_timeout_work(struct work_struct *work)
		struct request_queue *blk_alloc_queue(int node_id)
		{
		struct request_queue *q;
		struct queue_atomic_write_limits *aw_limits;
		int ret;

		q = kmem_cache_alloc_node(blk_requestq_cachep,
		@@ -765,10 +769,17 @@ struct request_queue *blk_alloc_queue(int node_id)
		if (!q)
		return NULL;

		aw_limits = kmem_cache_alloc_node(queue_atomic_write_cachep,
		GFP_KERNEL \| __GFP_ZERO, node_id);
		if (!aw_limits)
		goto fail_q;

		q->limits.aw_limits = aw_limits;

		q->last_merge = NULL;

		if (blk_alloc_queue_dispatch_async(q))
		goto fail_q;
		goto fail_aw;

		q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
		if (q->id < 0)
		@@ -823,6 +834,7 @@ struct request_queue *blk_alloc_queue(int node_id)

		blk_queue_dma_alignment(q, 511);
		blk_set_default_limits(&q->limits);
		blk_set_default_atomic_write_limits(&q->limits);
		q->nr_requests = BLKDEV_MAX_RQ;

		return q;
		@@ -839,6 +851,8 @@ struct request_queue *blk_alloc_queue(int node_id)
		ida_simple_remove(&blk_queue_ida, q->id);
		fail_dispatch_async:
		blk_free_queue_dispatch_async(q);
		fail_aw:
		kmem_cache_free(queue_atomic_write_cachep, aw_limits);
		fail_q:
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
		@@ -1052,6 +1066,18 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
		return BLK_STS_OK;
		}

		/*
		 * blk_validate_atomic_write_op_size - check the size of an atomic write bio
		 * @q: request queue the bio is being submitted to
		 * @bio: bio to validate
		 *
		 * Returns BLK_STS_INVAL when the bio payload is larger than
		 * queue_atomic_write_unit_max_bytes(), when it is not a multiple of
		 * queue_atomic_write_unit_min_bytes(), or when the queue reports a zero
		 * minimum unit. Returns BLK_STS_OK otherwise.
		 */
		static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
								 struct bio *bio)
		{
			unsigned int unit_min;

			if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q))
				return BLK_STS_INVAL;

			/*
			 * A zero-sized bio passes the check above even when the limits are
			 * zero, so a zero minimum unit must be rejected here rather than
			 * used as a divisor.
			 */
			unit_min = queue_atomic_write_unit_min_bytes(q);
			if (!unit_min || bio->bi_iter.bi_size % unit_min)
				return BLK_STS_INVAL;

			return BLK_STS_OK;
		}

		static noinline_for_stack bool submit_bio_checks(struct bio *bio)
		{
		struct request_queue *q = bio->bi_disk->queue;
		@@ -1133,6 +1159,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
		if (!q->limits.max_write_zeroes_sectors)
		goto not_supported;
		break;
		case REQ_OP_WRITE:
		if (bio->bi_opf & REQ_ATOMIC) {
		status = blk_validate_atomic_write_op_size(q, bio);
		if (status != BLK_STS_OK)
		goto end_io;
		}
		break;
		default:
		break;
		}
		@@ -1391,7 +1424,7 @@ EXPORT_SYMBOL(submit_bio);
		static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
		struct request *rq)
		{
		unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
		unsigned int max_sectors = blk_queue_get_max_sectors_wrapper(rq);

		if (blk_rq_sectors(rq) > max_sectors) {
		/*
		@@ -2138,6 +2171,8 @@ int __init blk_dev_init(void)

		blk_requestq_cachep = kmem_cache_create("request_queue",
		sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
		queue_atomic_write_cachep = kmem_cache_create("queue_atomic_write",
		sizeof(struct queue_atomic_write_limits), 0, SLAB_PANIC, NULL);

		blk_debugfs_root = debugfs_create_dir("block", NULL);

block/blk-merge.c

+94 −1

Original line number	Diff line number	Diff line
		@@ -13,6 +13,46 @@
		#include "blk.h"
		#include "blk-rq-qos.h"

		/*
		* rq_straddles_atomic_write_boundary - check for boundary violation
		* @rq: request to check
		* @front: data size to be appended to front
		* @back: data size to be appended to back
		*
		* Determine whether merging a request or bio into another request will result
		* in a merged request which straddles an atomic write boundary.
		*
		* The value @front_adjust is the data which would be appended to the front of
		* @rq, while the value @back_adjust is the data which would be appended to the
		* back of @rq. Callers will typically only have either @front_adjust or
		* @back_adjust as non-zero.
		*
		*/
		static bool rq_straddles_atomic_write_boundary(struct request *rq,
		unsigned int front_adjust,
		unsigned int back_adjust)
		{
		unsigned int boundary = queue_atomic_write_boundary_bytes(rq->q);
		u64 mask, start_rq_pos, end_rq_pos;

		if (!boundary)
		return false;

		start_rq_pos = blk_rq_pos(rq) << SECTOR_SHIFT;
		end_rq_pos = start_rq_pos + blk_rq_bytes(rq) - 1;

		start_rq_pos -= front_adjust;
		end_rq_pos += back_adjust;

		mask = ~(boundary - 1);

		/* Top bits are different, so crossed a boundary */
		if ((start_rq_pos & mask) != (end_rq_pos & mask))
		return true;

		return false;
		}

		static inline bool bio_will_gap(struct request_queue *q,
		struct request prev_rq, struct bio prev, struct bio *next)
		{
		@@ -145,11 +185,20 @@ static inline unsigned get_max_io_size(struct request_queue *q,
		struct bio *bio)
		{
		unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
		unsigned max_sectors = sectors;
		unsigned max_sectors;
		unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
		unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
		unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);

		/*
		* We ignore lim->max_sectors for atomic writes simply because
		* it may be less than the bio size, which we cannot tolerate.
		*/
		if (bio->bi_opf & REQ_ATOMIC)
		max_sectors = q->limits.aw_limits->atomic_write_max_sectors;
		else
		max_sectors = sectors;

		max_sectors += start_offset;
		max_sectors &= ~(pbs - 1);
		if (max_sectors > start_offset)
		@@ -278,6 +327,11 @@ static struct bio blk_bio_segment_split(struct request_queue q,
		*segs = nsegs;
		return NULL;
		split:
		if (bio->bi_opf & REQ_ATOMIC) {
		bio->bi_status = BLK_STS_INVAL;
		bio_endio(bio);
		return ERR_PTR(-EINVAL);
		}
		*segs = nsegs;
		return bio_split(bio, sectors, GFP_NOIO, bs);
		}
		@@ -594,6 +648,13 @@ int ll_back_merge_fn(struct request req, struct bio bio, unsigned int nr_segs)
		return 0;
		}

		if (req->cmd_flags & REQ_ATOMIC) {
		if (rq_straddles_atomic_write_boundary(req,
		bio->bi_iter.bi_size, 0)) {
		return 0;
		}
		}

		return ll_new_hw_segment(req, bio, nr_segs);
		}

		@@ -613,6 +674,13 @@ static int ll_front_merge_fn(struct request req, struct bio bio,
		return 0;
		}

		if (req->cmd_flags & REQ_ATOMIC) {
		if (rq_straddles_atomic_write_boundary(req,
		0, bio->bi_iter.bi_size)) {
		return 0;
		}
		}

		return ll_new_hw_segment(req, bio, nr_segs);
		}

		@@ -649,6 +717,13 @@ static int ll_merge_requests_fn(struct request_queue q, struct request req,
		blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		return 0;

		if (req->cmd_flags & REQ_ATOMIC) {
		if (rq_straddles_atomic_write_boundary(req,
		0, blk_rq_bytes(next))) {
		return 0;
		}
		}

		total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
		if (total_phys_segments > blk_rq_get_max_segments(req))
		return 0;
		@@ -721,6 +796,18 @@ static enum elv_merge blk_try_req_merge(struct request *req,
		return ELEVATOR_NO_MERGE;
		}

		static bool blk_atomic_write_mergeable_rq_bio(struct request *rq,
		struct bio *bio)
		{
		return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC);
		}

		static bool blk_atomic_write_mergeable_rqs(struct request *rq,
		struct request *next)
		{
		return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
		}

		/*
		* For non-mq, this has to be called with the request spinlock acquired.
		* For mq with scheduling, the appropriate queue wide lock should be held.
		@@ -752,6 +839,9 @@ static struct request attempt_merge(struct request_queue q,
		if (req->ioprio != next->ioprio)
		return NULL;

		if (!blk_atomic_write_mergeable_rqs(req, next))
		return NULL;

		/*
		* If we are allowed to merge, then append bio list
		* from next to rq and release next. merge_requests_fn
		@@ -895,6 +985,9 @@ bool blk_rq_merge_ok(struct request rq, struct bio bio)
		if (rq->ioprio != bio_prio(bio))
		return false;

		if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
		return false;

		return true;
		}

block/blk-mq-debugfs.c

+1 −0

Original line number	Diff line number	Diff line
		@@ -306,6 +306,7 @@ static const char *const cmd_flag_name[] = {
		CMD_FLAG_NAME(NOWAIT),
		CMD_FLAG_NAME(NOUNMAP),
		CMD_FLAG_NAME(HIPRI),
		CMD_FLAG_NAME(ATOMIC),
		};
		#undef CMD_FLAG_NAME

block/blk-settings.c

+57 −0

Original line number	Diff line number	Diff line
		@@ -63,6 +63,20 @@ void blk_set_default_limits(struct queue_limits *lim)
		}
		EXPORT_SYMBOL(blk_set_default_limits);

		void blk_set_default_atomic_write_limits(struct queue_limits *lim)
		{
		if (lim->aw_limits) {
		lim->aw_limits->atomic_write_hw_max = 0;
		lim->aw_limits->atomic_write_max_sectors = 0;
		lim->aw_limits->atomic_write_hw_boundary = 0;
		lim->aw_limits->atomic_write_hw_unit_min = 0;
		lim->aw_limits->atomic_write_unit_min = 0;
		lim->aw_limits->atomic_write_hw_unit_max = 0;
		lim->aw_limits->atomic_write_unit_max = 0;
		}
		}
		EXPORT_SYMBOL(blk_set_default_atomic_write_limits);

		/**
		* blk_set_stacking_limits - set default limits for stacking devices
		* @lim: the queue_limits structure to reset
		@@ -127,6 +141,46 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
		}
		EXPORT_SYMBOL(blk_queue_bounce_limit);

		/*
		 * Returns max guaranteed bytes which we can fit in a bio.
		 *
		 * The first and last segments are only counted as one logical block each;
		 * every segment in between is assumed to hold at least PAGE_SIZE. The
		 * segment count is capped at BIO_MAX_PAGES.
		 */
		static
		unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *limits)
		{
			unsigned int nr_segs = min((u16)BIO_MAX_PAGES, limits->max_segments);
			unsigned int edge_segs = min(nr_segs, 2U);
			unsigned int bytes = edge_segs * limits->logical_block_size;

			/* With at most two segments there are no full-page middle segments */
			if (nr_segs <= 2)
				return bytes;

			bytes += (nr_segs - 2) * PAGE_SIZE;
			return bytes;
		}

		void blk_atomic_writes_update_limits(struct queue_limits *limits)
		{
		unsigned int unit_limit = min(limits->max_hw_sectors << SECTOR_SHIFT,
		blk_queue_max_guaranteed_bio(limits));

		unit_limit = rounddown_pow_of_two(unit_limit);

		if (!limits->aw_limits)
		return;

		limits->aw_limits->atomic_write_max_sectors =
		min(limits->aw_limits->atomic_write_hw_max >> SECTOR_SHIFT,
		limits->max_hw_sectors);
		limits->aw_limits->atomic_write_unit_min =
		min(limits->aw_limits->atomic_write_hw_unit_min, unit_limit);
		limits->aw_limits->atomic_write_unit_max =
		min(limits->aw_limits->atomic_write_hw_unit_max, unit_limit);
		}

		EXPORT_SYMBOL(blk_atomic_writes_update_limits);

		/**
		* blk_queue_max_hw_sectors - set max sectors for a request for this queue
		* @q: the request queue for the device
		@@ -161,6 +215,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
		max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
		max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
		limits->max_sectors = max_sectors;

		blk_atomic_writes_update_limits(limits);

		q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
		}
		EXPORT_SYMBOL(blk_queue_max_hw_sectors);