Unverified Commit 7dd4206a authored by openeuler-ci-bot's avatar openeuler-ci-bot Committed by Gitee
Browse files

!2459 Bugfixes for RDMA/hns

Merge Pull Request from: @stinft 
 
Upload seven bugfixes to fix four issues.
1. Problems related to dca:
  (1) HNS_ROCE_UCTX_RSP_DCA_FLAGS is set only if HNS_ROCE_UCTX_CONFIG_DCA is configured;
  (2) The hr_qp can be a NULL pointer. A check has been added to avoid illegal access.
  (3) DCA debugfs is not needed when DCA is not set for this ucontext.
  (4) When the device is unregistered or a ucontext is destroyed while dca debugfs is being accessed 
      concurrently, a NULL pointer may be dereferenced. This patch fixes it by delaying the pointer 
      assignment to NULL until debugfs has been unregistered.
https://gitee.com/openeuler/kernel/issues/I87LCF

2. Fix printing level of asynchronous events:
   The current driver will print all asynchronous events. Some of the print levels are set improperly, 
   e.g. SRQ limit reach and SRQ last wqe reach, which may also occur during normal operation of the 
   software. Currently, the information for these events is printed as a warning, which causes a large 
   amount of printing even during normal use of the application. As a result, the service performance 
   deteriorates. This patch fixes the printing storms by modifying the print level.
https://gitee.com/openeuler/kernel/issues/I87LIY

3. Fix signed-unsigned mix with relational:
   The ib_mtu_enum_to_int() and uverbs_attr_get_len() may return a negative value. In this case, 
   mixed comparisons of signed and unsigned types will produce wrong results.
https://gitee.com/openeuler/kernel/issues/I87LLN

4. Fix the concurrency error between bond and reset:
   In the concurrency process between setting bond and reset, when the reset process is finished, the 
   driver detects that bond resource has already been allocated, thus entering the bond recover 
   process, where the bond state is set to HNS_ROCE_BOND_IS_BONDED. But at this point
   the set bond process hasn't been executed yet (i.e. slaves haven't been uninited). This wrong bond 
   state leads to the abnormal reset result that 2 slaves are both registered as bond device.
   Thus delete the bond state setting in bond recover process. Besides, to fix other potential 
   concurrency errors between bond and reset, some improvements are also added:
   (1) For the situation that reset occurs before bond work, add a reset check at the beginning of 
       bond work. If there is an ongoing reset process, re-queue the bond work until the reset is 
       finished.
   (2) For the situation that reset occurs during bond work, add reset checks to bond init/uninit 
       process, treating this situation as an abnormal case.
https://gitee.com/openeuler/kernel/issues/I87LSW

 Chengchang Tang (6):
   RDMA/hns: Fix context dca configuration
   RDMA/hns: Fix potential NULL pointer in DCA memory query
   RDMA/hns: Fix registering dca debugfs when dca has not been set
   RDMA/hns: Fix printing level of asynchronous events
   RDMA/hns: Fix signed-unsigned mix with relational
   RDMA/hns: Fix unregistering device and accessing to debugfs concurrently
 
Junxian Huang (1):
   RDMA/hns: Fix the concurrency error between bond and reset. 
 
Link:https://gitee.com/openeuler/kernel/pulls/2459

 

Reviewed-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
parents 17c17591 b927e306
Loading
Loading
Loading
Loading
+113 −41
Original line number Diff line number Diff line
@@ -212,8 +212,11 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)

	for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) {
		net_dev = bond_grp->bond_func_info[i].net_dev;
		if (net_dev)
			hns_roce_bond_uninit_client(bond_grp, i);
		if (net_dev) {
			ret = hns_roce_bond_uninit_client(bond_grp, i);
			if (ret)
				goto set_err;
		}
	}

	bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING;
@@ -235,15 +238,19 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)

	ret = bond_grp->main_hr_dev ?
	      hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND) : -EIO;
	if (ret)
		goto set_err;

	bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
	complete(&bond_grp->bond_work_done);
	ibdev_info(&bond_grp->main_hr_dev->ib_dev, "RoCE set bond finished!\n");

	if (ret)
	return;

set_err:
	bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
	BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret);
	else
		ibdev_info(&bond_grp->main_hr_dev->ib_dev,
			   "RoCE set bond finished!\n");
	hns_roce_cleanup_bond(bond_grp);
}

static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
@@ -259,7 +266,11 @@ static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
	bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
	bond_grp->main_hr_dev = NULL;

	hns_roce_bond_uninit_client(bond_grp, main_func_idx);
	ret = hns_roce_bond_uninit_client(bond_grp, main_func_idx);
	if (ret) {
		BOND_ERR_LOG("failed to uninit bond, ret = %d.\n", ret);
		return;
	}

	for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
		net_dev = bond_grp->bond_func_info[i].net_dev;
@@ -273,8 +284,7 @@ static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
out:
	ret = hns_roce_cleanup_bond(bond_grp);
	if (!ret)
		ibdev_info(&bond_grp->main_hr_dev->ib_dev,
			   "RoCE clear bond finished!\n");
		ibdev_info(&hr_dev->ib_dev, "RoCE clear bond finished!\n");
}

static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp)
@@ -304,8 +314,15 @@ static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp)
	int ret;

	while (inc_slave_map > 0) {
		if (inc_slave_map & 1)
			hns_roce_bond_uninit_client(bond_grp, inc_func_idx);
		if (inc_slave_map & 1) {
			ret = hns_roce_bond_uninit_client(bond_grp, inc_func_idx);
			if (ret) {
				BOND_ERR_LOG("failed to uninit slave %u, ret = %d.\n",
					     inc_func_idx, ret);
				bond_grp->bond_func_info[inc_func_idx].net_dev = NULL;
				bond_grp->slave_map &= ~(1U << inc_func_idx);
			}
		}
		inc_slave_map >>= 1;
		inc_func_idx++;
	}
@@ -326,22 +343,27 @@ static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp)
			   "RoCE slave increase finished!\n");
}

static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)
static int switch_main_dev(struct hns_roce_bond_group *bond_grp,
			   u32 *dec_slave_map, u8 main_func_idx)
{
	u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
	u32 dec_slave_map = bond_grp->slave_map_diff;
	struct hns_roce_dev *hr_dev;
	struct net_device *net_dev;
	u8 dec_func_idx = 0;
	int ret;
	int i;

	if (dec_slave_map & (1 << main_func_idx)) {
	bond_grp->main_hr_dev = NULL;
		hns_roce_bond_uninit_client(bond_grp, main_func_idx);
	ret = hns_roce_bond_uninit_client(bond_grp, main_func_idx);
	if (ret) {
		BOND_ERR_LOG("failed to uninit main dev %u, ret = %d.\n",
			     main_func_idx, ret);
		*dec_slave_map &= ~(1U << main_func_idx);
		bond_grp->slave_map |= (1U << main_func_idx);
		return ret;
	}

	for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
		net_dev = bond_grp->bond_func_info[i].net_dev;
			if (!(dec_slave_map & (1 << i)) && net_dev) {
		if (!(*dec_slave_map & (1 << i)) && net_dev) {
			bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING;
			hr_dev = hns_roce_bond_init_client(bond_grp, i);
			if (hr_dev) {
@@ -350,12 +372,37 @@ static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)
			}
		}
	}

	if (!bond_grp->main_hr_dev)
		return -ENODEV;

	return 0;
}

static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)
{
	u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
	u32 dec_slave_map = bond_grp->slave_map_diff;
	struct net_device *net_dev;
	u8 dec_func_idx = 0;
	int ret;

	if (dec_slave_map & (1 << main_func_idx)) {
		ret = switch_main_dev(bond_grp, &dec_slave_map, main_func_idx);
		if (ret == -ENODEV)
			goto dec_err;
	}

	while (dec_slave_map > 0) {
		if (dec_slave_map & 1) {
			net_dev = bond_grp->bond_func_info[dec_func_idx].net_dev;
			bond_grp->bond_func_info[dec_func_idx].net_dev = NULL;
			hns_roce_bond_init_client(bond_grp, dec_func_idx);
			if (!hns_roce_bond_init_client(bond_grp, dec_func_idx)) {
				BOND_ERR_LOG("failed to re-init slave %u.\n",
					     dec_func_idx);
				bond_grp->slave_map |= (1U << dec_func_idx);
				bond_grp->bond_func_info[dec_func_idx].net_dev = net_dev;
			}
		}
		dec_slave_map >>= 1;
		dec_func_idx++;
@@ -366,16 +413,20 @@ static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)

	ret = bond_grp->main_hr_dev ?
	      hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND) : -EIO;
	if (ret)
		goto dec_err;

	bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
	complete(&bond_grp->bond_work_done);

	if (ret)
		BOND_ERR_LOG("failed to decrease RoCE bond slave, ret = %d.\n",
			     ret);
	else
	ibdev_info(&bond_grp->main_hr_dev->ib_dev,
		   "RoCE slave decrease finished!\n");

	return;

dec_err:
	bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
	BOND_ERR_LOG("failed to decrease RoCE bond slave, ret = %d.\n", ret);
	hns_roce_cleanup_bond(bond_grp);
}

static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp)
@@ -415,7 +466,25 @@ static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp)
	}
}

void hns_roce_do_bond_work(struct work_struct *work)
/*
 * is_bond_slave_in_reset - check whether any bond slave is mid-reset.
 * @bond_grp: bond group whose bond_func_info[] slots are scanned.
 *
 * Walks all ROCE_BOND_FUNC_MAX slots. A slot is only considered when it has
 * both a net_dev and a handle. Such a slot counts as "in reset" when its
 * handle's rinfo.reset_state is neither HNS_ROCE_STATE_NON_RST nor
 * HNS_ROCE_STATE_RST_INITED.
 *
 * Return: true as soon as one such slot is found, false otherwise.
 */
bool is_bond_slave_in_reset(struct hns_roce_bond_group *bond_grp)
{
	struct hnae3_handle *slave_handle;
	struct net_device *slave_ndev;
	int idx;

	for (idx = 0; idx < ROCE_BOND_FUNC_MAX; idx++) {
		slave_ndev = bond_grp->bond_func_info[idx].net_dev;
		slave_handle = bond_grp->bond_func_info[idx].handle;

		/* Slots without a netdev or a handle are not examined. */
		if (!slave_ndev || !slave_handle)
			continue;

		/* These two states are the only ones not treated as "in reset". */
		if (slave_handle->rinfo.reset_state == HNS_ROCE_STATE_NON_RST)
			continue;
		if (slave_handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INITED)
			continue;

		return true;
	}

	return false;
}

static void hns_roce_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct hns_roce_bond_group *bond_grp =
@@ -423,15 +492,19 @@ void hns_roce_do_bond_work(struct work_struct *work)
			     bond_work);
	int status;

	if (is_bond_slave_in_reset(bond_grp))
		goto queue_work;

	status = mutex_trylock(&roce_bond_mutex);
	if (!status) {
		/* delay 1 sec */
		hns_roce_queue_bond_work(bond_grp, HZ);
		return;
	}
	if (!status)
		goto queue_work;

	hns_roce_do_bond(bond_grp);
	mutex_unlock(&roce_bond_mutex);
	return;

queue_work:
	hns_roce_queue_bond_work(bond_grp, HZ);
}

int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
@@ -453,7 +526,6 @@ int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
				  ret);
			return ret;
		}
		bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
	}

	hr_dev->bond_nb.notifier_call = hns_roce_bond_event;
+1 −0
Original line number Diff line number Diff line
@@ -87,5 +87,6 @@ bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev);
struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
						  u8 bus_num);
bool is_bond_slave_in_reset(struct hns_roce_bond_group *bond_grp);

#endif
+7 −3
Original line number Diff line number Diff line
@@ -1752,13 +1752,17 @@ static int UVERBS_HANDLER(HNS_IB_METHOD_DCA_MEM_QUERY)(
	struct uverbs_attr_bundle *attrs)
{
	struct hns_roce_qp *hr_qp = uverbs_attr_to_hr_qp(attrs);
	struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
	struct hns_roce_dca_ctx *ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp);
	struct dca_page_query_active_attr active_attr = {};
	struct hns_roce_dca_ctx *ctx = NULL;
	struct hns_roce_dev *hr_dev = NULL;
	u32 page_idx, page_ofs;
	int ret;

	if (!hr_qp)
	if (hr_qp)
		hr_dev = to_hr_dev(hr_qp->ibqp.device);
	if (hr_dev)
		ctx = hr_qp_to_dca_ctx(hr_dev, hr_qp);
	if (!ctx)
		return -EINVAL;

	ret = uverbs_copy_from(&page_idx, attrs,
+3 −4
Original line number Diff line number Diff line
@@ -508,7 +508,7 @@ void hns_roce_register_uctx_debugfs(struct hns_roce_dev *hr_dev,
		return;

	dca_dbgfs = dev_dbgfs->dca_root;
	if (dca_dbgfs) {
	if (dca_dbgfs && (uctx->config & HNS_ROCE_UCTX_CONFIG_DCA)) {
		uctx->dca_dbgfs = kzalloc(sizeof(struct hns_dca_ctx_debugfs),
					  GFP_KERNEL);
		if (!uctx->dca_dbgfs)
@@ -525,8 +525,8 @@ void hns_roce_unregister_uctx_debugfs(struct hns_roce_dev *hr_dev,
	struct hns_dca_ctx_debugfs *dbgfs = uctx->dca_dbgfs;

	if (dbgfs) {
		uctx->dca_dbgfs = NULL;
		cleanup_dca_ctx_debugfs(dbgfs);
		uctx->dca_dbgfs = NULL;
		kfree(dbgfs);
	}
}
@@ -567,14 +567,13 @@ void hns_roce_unregister_debugfs(struct hns_roce_dev *hr_dev)
	if (!dbgfs)
		return;

	hr_dev->dbgfs = NULL;

	if (dbgfs->dca_root) {
		destroy_dca_debugfs(dbgfs->dca_root);
		dbgfs->dca_root = NULL;
	}

	debugfs_remove_recursive(dbgfs->root);
	hr_dev->dbgfs = NULL;
	kfree(dbgfs);
}

+15 −7
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ static bool check_inl_data_len(struct hns_roce_qp *qp, unsigned int len)
	struct hns_roce_dev *hr_dev = to_hr_dev(qp->ibqp.device);
	int mtu = ib_mtu_enum_to_int(qp->path_mtu);

	if (len > qp->max_inline_data || len > mtu) {
	if (mtu < 0 || len > qp->max_inline_data || len > mtu) {
		ibdev_err(&hr_dev->ib_dev,
			  "invalid length of data, data len = %u, max inline len = %u, path mtu = %d.\n",
			  len, qp->max_inline_data, mtu);
@@ -6240,7 +6240,7 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
	case HNS_ROCE_EVENT_TYPE_COMM_EST:
		break;
	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
		ibdev_warn(ibdev, "Send queue drained.\n");
		ibdev_dbg(ibdev, "Send queue drained.\n");
		break;
	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
		ibdev_err(ibdev, "Local work queue 0x%x catast error, sub_event type is: %d\n",
@@ -6255,10 +6255,10 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
			  irq_work->queue_num, irq_work->sub_type);
		break;
	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
		ibdev_warn(ibdev, "SRQ limit reach.\n");
		ibdev_dbg(ibdev, "SRQ limit reach.\n");
		break;
	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
		ibdev_warn(ibdev, "SRQ last wqe reach.\n");
		ibdev_dbg(ibdev, "SRQ last wqe reach.\n");
		break;
	case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
		ibdev_err(ibdev, "SRQ catas error.\n");
@@ -7401,6 +7401,9 @@ struct hns_roce_dev
	if (!handle || !handle->client)
		return NULL;

	if (is_bond_slave_in_reset(bond_grp))
		return NULL;

	ret = hns_roce_hw_v2_init_instance(handle);
	if (ret)
		return NULL;
@@ -7408,19 +7411,24 @@ struct hns_roce_dev
	return handle->priv;
}

void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
int hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
				int func_idx)
{
	struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;

	if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
		return;
		return -EPERM;

	if (is_bond_slave_in_reset(bond_grp))
		return -EBUSY;

	handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;

	__hns_roce_hw_v2_uninit_instance(handle, false, false);

	handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;

	return 0;
}

static void hns_roce_v2_reset_notify_user(struct hns_roce_dev *hr_dev)
Loading