Commit ca27f5b5 authored by Jens Axboe's avatar Jens Axboe
Browse files

Merge tag 'nvme-5.15-2021-08-18' of git://git.infradead.org/nvme into for-5.15/drivers

Pull NVMe updates from Christoph:

"nvme updates for Linux 5.15.

 - suspend improvements for devices with an HMB (Keith Busch)
 - handle double completions more gacefull (Sagi Grimberg)
 - cleanup the selects for the nvme core code a bit (Sagi Grimberg)
 - don't update queue count when failing to set io queues (Ruozhu Li)
 - various nvmet connect fixes (Amit Engel)
 - cleanup lightnvm leftovers (Keith Busch, me)
 - small cleanups (Colin Ian King, Hou Pu)
 - add tracing for the Set Features command (Hou Pu)
 - CMB sysfs cleanups (Keith Busch)
 - add a mutex_destroy call (Keith Busch)"

* tag 'nvme-5.15-2021-08-18' of git://git.infradead.org/nvme: (21 commits)
  nvme: remove the unused NVME_NS_* enum
  nvme: remove nvm_ndev from ns
  nvme: Have NVME_FABRICS select NVME_CORE instead of transport drivers
  nvmet: check that host sqsize does not exceed ctrl MQES
  nvmet: avoid duplicate qid in connect cmd
  nvmet: pass back cntlid on successful completion
  nvme-rdma: don't update queue count when failing to set io queues
  nvme-tcp: don't update queue count when failing to set io queues
  nvme-tcp: pair send_mutex init with destroy
  nvme: allow user toggling hmb usage
  nvme-pci: disable hmb on idle suspend
  nvmet: remove redundant assignments of variable status
  nvmet: add set feature tracing support
  nvme: add set feature tracing support
  nvme-fabrics: remove superfluous nvmf_host_put in nvmf_parse_options
  nvme-pci: cmb sysfs: one file, one value
  nvme-pci: use attribute group for cmb sysfs
  nvme: code command_id with a genctr for use-after-free validation
  nvme-tcp: don't check blk_mq_tag_to_rq when receiving pdu data
  nvme-pci: limit maximum queue depth to 4095
  ...
parents b1a81163 9891668e
Loading
Loading
Loading
Loading
+1 −3
Original line number Diff line number Diff line
@@ -33,12 +33,12 @@ config NVME_HWMON
	  in the system.

config NVME_FABRICS
	select NVME_CORE
	tristate

config NVME_RDMA
	tristate "NVM Express over Fabrics RDMA host driver"
	depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK
	select NVME_CORE
	select NVME_FABRICS
	select SG_POOL
	help
@@ -55,7 +55,6 @@ config NVME_FC
	tristate "NVM Express over Fabrics FC host driver"
	depends on BLOCK
	depends on HAS_DMA
	select NVME_CORE
	select NVME_FABRICS
	select SG_POOL
	help
@@ -72,7 +71,6 @@ config NVME_TCP
	tristate "NVM Express over Fabrics TCP host driver"
	depends on INET
	depends on BLOCK
	select NVME_CORE
	select NVME_FABRICS
	select CRYPTO
	select CRYPTO_CRC32C
+2 −1
Original line number Diff line number Diff line
@@ -1026,7 +1026,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
		return BLK_STS_IOERR;
	}

	cmd->common.command_id = req->tag;
	nvme_req(req)->genctr++;
	cmd->common.command_id = nvme_cid(req);
	trace_nvme_setup_cmd(req, cmd);
	return ret;
}
+0 −1
Original line number Diff line number Diff line
@@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
				ret = -EINVAL;
				goto out;
			}
			nvmf_host_put(opts->host);
			opts->host = nvmf_host_add(p);
			kfree(p);
			if (!opts->host) {
+46 −7
Original line number Diff line number Diff line
@@ -47,11 +47,6 @@ extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;

enum {
	NVME_NS_LBA		= 0,
	NVME_NS_LIGHTNVM	= 1,
};

/*
 * List of workarounds for devices that required behavior not specified in
 * the standard.
@@ -152,6 +147,7 @@ enum nvme_quirks {
struct nvme_request {
	struct nvme_command	*cmd;
	union nvme_result	result;
	u8			genctr;
	u8			retries;
	u8			flags;
	u16			status;
@@ -443,7 +439,6 @@ struct nvme_ns {
	u32 ana_grpid;
#endif
	struct list_head siblings;
	struct nvm_dev *ndev;
	struct kref kref;
	struct nvme_ns_head *head;

@@ -491,6 +486,49 @@ struct nvme_ctrl_ops {
	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
};

/*
 * nvme command_id is constructed as such:
 * | xxxx | xxxxxxxxxxxx |
 *   gen    request tag
 */
#define nvme_genctr_mask(gen)			(gen & 0xf)
#define nvme_cid_install_genctr(gen)		(nvme_genctr_mask(gen) << 12)
#define nvme_genctr_from_cid(cid)		((cid & 0xf000) >> 12)
#define nvme_tag_from_cid(cid)			(cid & 0xfff)

static inline u16 nvme_cid(struct request *rq)
{
	return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
}

static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
		u16 command_id)
{
	u8 genctr = nvme_genctr_from_cid(command_id);
	u16 tag = nvme_tag_from_cid(command_id);
	struct request *rq;

	rq = blk_mq_tag_to_rq(tags, tag);
	if (unlikely(!rq)) {
		pr_err("could not locate request for tag %#x\n",
			tag);
		return NULL;
	}
	if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) {
		dev_err(nvme_req(rq)->ctrl->device,
			"request %#x genctr mismatch (got %#x expected %#x)\n",
			tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
		return NULL;
	}
	return rq;
}

static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags,
                u16 command_id)
{
	return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id));
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
			    const char *dev_name);
@@ -588,7 +626,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)

static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
{
	return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH;
	return !qid &&
		nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
}

void nvme_complete_rq(struct request *req);
+137 −44
Original line number Diff line number Diff line
@@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

#define NVME_PCI_MIN_QUEUE_SIZE 2
#define NVME_PCI_MAX_QUEUE_SIZE 4095
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
@@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = {

static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");

static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
@@ -135,6 +137,7 @@ struct nvme_dev {
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	u32 last_ps;
	bool hmb;

	mempool_t *iod_mempool;

@@ -153,18 +156,14 @@ struct nvme_dev {
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;

	bool attrs_added;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	u32 n;

	ret = kstrtou32(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_uint(val, kp);
	return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
			NVME_PCI_MAX_QUEUE_SIZE);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@@ -1014,7 +1013,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
		return;
	}

	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id);
	req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
@@ -1808,17 +1807,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
	return ret >= 0 ? 0 : ret;
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
@@ -1887,20 +1875,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
@@ -1923,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	} else
		dev->hmb = bits & NVME_HOST_MEM_ENABLE;

	return ret;
}

@@ -2080,6 +2056,102 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
	return ret;
}

static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);

static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%u\n", ndev->cmbloc);
}
static DEVICE_ATTR_RO(cmbloc);

static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%u\n", ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmbsz);

static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%d\n", ndev->hmb);
}

static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
	bool new;
	int ret;

	if (strtobool(buf, &new) < 0)
		return -EINVAL;

	if (new == ndev->hmb)
		return count;

	if (new) {
		ret = nvme_setup_host_mem(ndev);
	} else {
		ret = nvme_set_host_mem(ndev, 0);
		if (!ret)
			nvme_free_host_mem(ndev);
	}

	if (ret < 0)
		return ret;

	return count;
}
static DEVICE_ATTR_RW(hmb);

static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct nvme_ctrl *ctrl =
		dev_get_drvdata(container_of(kobj, struct device, kobj));
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	if (a == &dev_attr_cmb.attr ||
	    a == &dev_attr_cmbloc.attr ||
	    a == &dev_attr_cmbsz.attr) {
	    	if (!dev->cmbsz)
			return 0;
	}
	if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
		return 0;

	return a->mode;
}

static struct attribute *nvme_pci_attrs[] = {
	&dev_attr_cmb.attr,
	&dev_attr_cmbloc.attr,
	&dev_attr_cmbsz.attr,
	&dev_attr_hmb.attr,
	NULL,
};

static const struct attribute_group nvme_pci_attr_group = {
	.attrs		= nvme_pci_attrs,
	.is_visible	= nvme_pci_attrs_are_visible,
};

/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
@@ -2751,6 +2823,10 @@ static void nvme_reset_work(struct work_struct *work)
		goto out;
	}

	if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
			&nvme_pci_attr_group))
		dev->attrs_added = true;

	nvme_start_ctrl(&dev->ctrl);
	return;

@@ -2999,6 +3075,13 @@ static void nvme_shutdown(struct pci_dev *pdev)
	nvme_disable_prepare_reset(dev, true);
}

static void nvme_remove_attrs(struct nvme_dev *dev)
{
	if (dev->attrs_added)
		sysfs_remove_group(&dev->ctrl.device->kobj,
				   &nvme_pci_attr_group);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
@@ -3020,7 +3103,7 @@ static void nvme_remove(struct pci_dev *pdev)
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_remove_attrs(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
@@ -3047,8 +3130,13 @@ static int nvme_resume(struct device *dev)

	if (ndev->last_ps == U32_MAX ||
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
		return nvme_try_sched_reset(&ndev->ctrl);
		goto reset;
	if (ctrl->hmpre && nvme_setup_host_mem(ndev))
		goto reset;

	return 0;
reset:
	return nvme_try_sched_reset(ctrl);
}

static int nvme_suspend(struct device *dev)
@@ -3072,15 +3160,9 @@ static int nvme_suspend(struct device *dev)
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
	 *
	 * If a host memory buffer is enabled, shut down the device as the NVMe
	 * specification allows the device to access the host memory buffer in
	 * host DRAM from all power states, but hosts will fail access to DRAM
	 * during S3.
	 */
	if (pm_suspend_via_firmware() || !ctrl->npss ||
	    !pcie_aspm_enabled(pdev) ||
	    ndev->nr_host_mem_descs ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);

@@ -3091,6 +3173,17 @@ static int nvme_suspend(struct device *dev)
	if (ctrl->state != NVME_CTRL_LIVE)
		goto unfreeze;

	/*
	 * Host memory access may not be successful in a system suspend state,
	 * but the specification allows the controller to access memory in a
	 * non-operational power state.
	 */
	if (ndev->hmb) {
		ret = nvme_set_host_mem(ndev, 0);
		if (ret < 0)
			goto unfreeze;
	}

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;
Loading