Commit 29802d7c authored by Andreas Hindborg's avatar Andreas Hindborg Committed by Jens Axboe
Browse files

ublk: enable zoned storage support



Add zoned storage support to ublk: report_zones and operations:
 - REQ_OP_ZONE_OPEN
 - REQ_OP_ZONE_CLOSE
 - REQ_OP_ZONE_FINISH
 - REQ_OP_ZONE_RESET
 - REQ_OP_ZONE_APPEND

The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
communicate ALBA back to the kernel. Therefore ublk must be used with the
user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
available. Without this feature, ublk will not allow zoned storage support.

Signed-off-by: default avatarAndreas Hindborg <a.hindborg@samsung.com>
Reviewed-by: default avatarMing Lei <ming.lei@redhat.com>
Tested-by: default avatarMing Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230804114610.179530-4-nmi@metaspace.dk


Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 1a6e88b9
Loading
Loading
Loading
Loading
+316 −15
Original line number Diff line number Diff line
@@ -56,16 +56,21 @@
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY)
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
		UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)

struct ublk_rq_data {
	struct llist_node node;

	struct kref ref;
	__u64 sector;
	__u32 operation;
	__u32 nr_zones;
};

struct ublk_uring_cmd_pdu {
@@ -185,11 +190,263 @@ struct ublk_params_header {
	__u32	types;
};

static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
						   int tag);

static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_USER_COPY;
}

static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
	return ub->dev_info.flags & UBLK_F_ZONED;
}

static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
{
	return ubq->flags & UBLK_F_ZONED;
}

#ifdef CONFIG_BLK_DEV_ZONED

static int ublk_get_nr_zones(const struct ublk_device *ub)
{
	const struct ublk_param_basic *p = &ub->params.basic;

	/* Zone size is a power of 2 */
	return p->dev_sectors >> ilog2(p->chunk_sectors);
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
}

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;
	int nr_zones;

	if (!ublk_dev_is_zoned(ub))
		return -EINVAL;

	if (!p->max_zone_append_sectors)
		return -EINVAL;

	nr_zones = ublk_get_nr_zones(ub);

	if (p->max_active_zones > nr_zones)
		return -EINVAL;

	if (p->max_open_zones > nr_zones)
		return -EINVAL;

	return 0;
}

static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	const struct ublk_param_zoned *p = &ub->params.zoned;

	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
	blk_queue_required_elevator_features(ub->ub_disk->queue,
					     ELEVATOR_F_ZBD_SEQ_WRITE);
	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);

	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);

	return 0;
}

/* Based on virtblk_alloc_report_buffer */
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
				      unsigned int nr_zones, size_t *buflen)
{
	struct request_queue *q = ublk->ub_disk->queue;
	size_t bufsize;
	void *buf;

	nr_zones = min_t(unsigned int, nr_zones,
			 ublk->ub_disk->nr_zones);

	bufsize = nr_zones * sizeof(struct blk_zone);
	bufsize =
		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);

	while (bufsize >= sizeof(struct blk_zone)) {
		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
		if (buf) {
			*buflen = bufsize;
			return buf;
		}
		bufsize >>= 1;
	}

	*buflen = 0;
	return NULL;
}

static int ublk_report_zones(struct gendisk *disk, sector_t sector,
		      unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct ublk_device *ub = disk->private_data;
	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
	unsigned int done_zones = 0;
	unsigned int max_zones_per_request;
	int ret;
	struct blk_zone *buffer;
	size_t buffer_length;

	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
			 nr_zones);

	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
	if (!buffer)
		return -ENOMEM;

	max_zones_per_request = buffer_length / sizeof(struct blk_zone);

	while (done_zones < nr_zones) {
		unsigned int remaining_zones = nr_zones - done_zones;
		unsigned int zones_in_request =
			min_t(unsigned int, remaining_zones, max_zones_per_request);
		struct request *req;
		struct ublk_rq_data *pdu;
		blk_status_t status;

		memset(buffer, 0, buffer_length);

		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			goto out;
		}

		pdu = blk_mq_rq_to_pdu(req);
		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
		pdu->sector = sector;
		pdu->nr_zones = zones_in_request;

		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
					GFP_KERNEL);
		if (ret) {
			blk_mq_free_request(req);
			goto out;
		}

		status = blk_execute_rq(req, 0);
		ret = blk_status_to_errno(status);
		blk_mq_free_request(req);
		if (ret)
			goto out;

		for (unsigned int i = 0; i < zones_in_request; i++) {
			struct blk_zone *zone = buffer + i;

			/* A zero length zone means no more zones in this response */
			if (!zone->len)
				break;

			ret = cb(zone, i, data);
			if (ret)
				goto out;

			done_zones++;
			sector += zone_size_sectors;

		}
	}

	ret = done_zones;

out:
	kvfree(buffer);
	return ret;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_ZONE_OPEN:
		ublk_op = UBLK_IO_OP_ZONE_OPEN;
		break;
	case REQ_OP_ZONE_CLOSE:
		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
		break;
	case REQ_OP_ZONE_FINISH:
		ublk_op = UBLK_IO_OP_ZONE_FINISH;
		break;
	case REQ_OP_ZONE_RESET:
		ublk_op = UBLK_IO_OP_ZONE_RESET;
		break;
	case REQ_OP_ZONE_APPEND:
		ublk_op = UBLK_IO_OP_ZONE_APPEND;
		break;
	case REQ_OP_DRV_IN:
		ublk_op = pdu->operation;
		switch (ublk_op) {
		case UBLK_IO_OP_REPORT_ZONES:
			iod->op_flags = ublk_op | ublk_req_build_flags(req);
			iod->nr_zones = pdu->nr_zones;
			iod->start_sector = pdu->sector;
			return BLK_STS_OK;
		default:
			return BLK_STS_IOERR;
		}
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_DRV_OUT:
		/* We do not support reset_all and drv_out */
		return BLK_STS_NOTSUPP;
	default:
		return BLK_STS_IOERR;
	}

	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

#else

#define ublk_report_zones (NULL)

static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}

static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}

static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
					 struct request *req)
{
	return -EOPNOTSUPP;
}

#endif

static inline void __ublk_complete_rq(struct request *req);
static void ublk_complete_rq(struct kref *ref);

@@ -286,6 +543,9 @@ static int ublk_validate_params(const struct ublk_device *ub)

		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;

		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
			return -EINVAL;
	} else
		return -EINVAL;

@@ -304,6 +564,11 @@ static int ublk_validate_params(const struct ublk_device *ub)
	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_validate(ub);
	else if (ublk_dev_is_zoned(ub))
		return -EINVAL;

	return 0;
}

@@ -317,6 +582,9 @@ static int ublk_apply_params(struct ublk_device *ub)
	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
		ublk_dev_param_discard_apply(ub);

	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
		return ublk_dev_param_zoned_apply(ub);

	return 0;
}

@@ -487,6 +755,7 @@ static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.open =		ublk_open,
	.free_disk =	ublk_free_disk,
	.report_zones =	ublk_report_zones,
};

#define UBLK_MAX_PIN_PAGES	32
@@ -601,7 +870,8 @@ static inline bool ublk_need_map_req(const struct request *req)

static inline bool ublk_need_unmap_req(const struct request *req)
{
	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
	return ublk_rq_has_data(req) &&
	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}

static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@@ -685,8 +955,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	enum req_op op = req_op(req);
	u32 ublk_op;

	if (!ublk_queue_is_zoned(ubq) &&
	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
		return -EIO;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
@@ -704,6 +979,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		if (ublk_queue_is_zoned(ubq))
			return ublk_setup_iod_zoned(ubq, req);
		return BLK_STS_IOERR;
	}

@@ -756,7 +1033,8 @@ static inline void __ublk_complete_rq(struct request *req)
	 *
	 * Both the two needn't unmap.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
	    req_op(req) != REQ_OP_DRV_IN)
		goto exit;

	/* for READ request, writing data in iod->addr to rq buffers */
@@ -1120,6 +1398,9 @@ static void ublk_commit_completion(struct ublk_device *ub,
	/* find the io request and complete */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);

	if (req_op(req) == REQ_OP_ZONE_APPEND)
		req->__sector = ub_cmd->zone_append_lba;

	if (req && likely(!blk_should_fake_timeout(req->q)))
		ublk_put_req_ref(ubq, req);
}
@@ -1468,8 +1749,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
			if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
						req_op(req) == REQ_OP_READ))
				goto out;
		} else if (ub_cmd->addr) {
			/* User copy requires addr to be unset */
		} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
			/*
			 * User copy requires addr to be unset when command is
			 * not zone append
			 */
			ret = -EINVAL;
			goto out;
		}
@@ -1546,11 +1830,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
		int ubuf_dir)
{
	/* copy ubuf to request pages */
	if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
	    ubuf_dir == ITER_SOURCE)
		return true;

	/* copy request pages to ubuf */
	if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
	if ((req_op(req) == REQ_OP_WRITE ||
	     req_op(req) == REQ_OP_ZONE_APPEND) &&
	    ubuf_dir == ITER_DEST)
		return true;

	return false;
@@ -1889,17 +2176,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)

	get_device(&ub->cdev_dev);
	ub->dev_info.state = UBLK_S_DEV_LIVE;

	if (ublk_dev_is_zoned(ub)) {
		ret = ublk_revalidate_disk_zones(ub);
		if (ret)
			goto out_put_cdev;
	}

	ret = add_disk(disk);
	if (ret)
		goto out_put_cdev;

	set_bit(UB_STATE_USED, &ub->state);

out_put_cdev:
	if (ret) {
		/*
		 * Has to drop the reference since ->free_disk won't be
		 * called in case of add_disk failure.
		 */
		ub->dev_info.state = UBLK_S_DEV_DEAD;
		ublk_put_device(ub);
		goto out_put_disk;
	}
	set_bit(UB_STATE_USED, &ub->state);
out_put_disk:
	if (ret)
		put_disk(disk);
@@ -2049,6 +2343,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
	if (ublk_dev_is_user_copy(ub))
		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;

	/* Zoned storage support requires user copy feature */
	if (ublk_dev_is_zoned(ub) &&
	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
		ret = -EINVAL;
		goto out_free_dev_number;
	}

	/* We are not ready to support zero copy */
	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;

+54 −9
Original line number Diff line number Diff line
@@ -176,6 +176,12 @@
/* Copy between request and user buffer by pread()/pwrite() */
#define UBLK_F_USER_COPY	(1UL << 7)

/*
 * User space sets this flag when setting up the device to request zoned storage support. Kernel may
 * deny the request by returning an error.
 */
#define UBLK_F_ZONED (1ULL << 8)

/* device state */
#define UBLK_S_DEV_DEAD	0
#define UBLK_S_DEV_LIVE	1
@@ -235,6 +241,23 @@ struct ublksrv_ctrl_dev_info {
#define		UBLK_IO_OP_DISCARD		3
#define		UBLK_IO_OP_WRITE_SAME		4
#define		UBLK_IO_OP_WRITE_ZEROES		5
#define		UBLK_IO_OP_ZONE_OPEN		10
#define		UBLK_IO_OP_ZONE_CLOSE		11
#define		UBLK_IO_OP_ZONE_FINISH		12
#define		UBLK_IO_OP_ZONE_APPEND		13
#define		UBLK_IO_OP_ZONE_RESET		15
/*
 * Construct a zone report. The report request is carried in `struct
 * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
 * and shall indicate the first zone of the report. The `nr_zones` shall
 * indicate how many zones should be reported at most. The report shall be
 * delivered as a `struct blk_zone` array. To report fewer zones than requested,
 * zero the last entry of the returned array.
 *
 * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in
 * include/uapi/linux/blkzoned.h are part of ublk UAPI.
 */
#define		UBLK_IO_OP_REPORT_ZONES		18

#define		UBLK_IO_F_FAILFAST_DEV		(1U << 8)
#define		UBLK_IO_F_FAILFAST_TRANSPORT	(1U << 9)
@@ -255,7 +278,10 @@ struct ublksrv_io_desc {
	/* op: bit 0-7, flags: bit 8-31 */
	__u32		op_flags;

	union {
		__u32		nr_sectors;
		__u32		nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */
	};

	/* start sector for this io */
	__u64		start_sector;
@@ -284,11 +310,21 @@ struct ublksrv_io_cmd {
	/* io result, it is valid for COMMIT* command only */
	__s32	result;

	union {
		/*
		 * userspace buffer address in ublksrv daemon process, valid for
		 * FETCH* command only
		 *
		 * `addr` should not be used when UBLK_F_USER_COPY is enabled,
		 * because userspace handles data copy by pread()/pwrite() over
		 * /dev/ublkcN. But in case of UBLK_F_ZONED, this union is
		 * re-used to pass back the allocated LBA for
		 * UBLK_IO_OP_ZONE_APPEND which actually depends on
		 * UBLK_F_USER_COPY
		 */
		__u64	addr;
		__u64	zone_append_lba;
	};
};

struct ublk_param_basic {
@@ -331,6 +367,13 @@ struct ublk_param_devt {
	__u32   disk_minor;
};

struct ublk_param_zoned {
	__u32	max_open_zones;
	__u32	max_active_zones;
	__u32	max_zone_append_sectors;
	__u8	reserved[20];
};

struct ublk_params {
	/*
	 * Total length of parameters, userspace has to set 'len' for both
@@ -342,11 +385,13 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_BASIC           (1 << 0)
#define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
#define UBLK_PARAM_TYPE_DEVT            (1 << 2)
#define UBLK_PARAM_TYPE_ZONED           (1 << 3)
	__u32	types;			/* types of parameter included */

	struct ublk_param_basic		basic;
	struct ublk_param_discard	discard;
	struct ublk_param_devt		devt;
	struct ublk_param_zoned	zoned;
};

#endif