Commit e6fb246c authored by Jason Gunthorpe

RDMA/mlx5: Consolidate MR destruction to mlx5_ib_dereg_mr()

Now that the SRCU stuff has been removed, the entire MR destroy logic can
be made a lot simpler. Currently there are many different ways to destroy
an MR, which makes it really hard to do this task correctly. Route all
destruction through mlx5_ib_dereg_mr() and make it work for all
situations.

Since it turns out all the different MR types do basically the same thing,
this removes a lot of knowledge of MR internals from ODP and leaves ODP
just exporting an operation to clean up children.
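
In driver-internal API terms, the two fence entry points collapse into a
single child-cleanup hook. The prototypes quoted here are condensed from
the mlx5_ib.h hunk below:

	/* removed: fencing is now done inside mlx5_ib_dereg_mr() itself */
	void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
	void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);

	/* added: the only MR-destroy knowledge ODP still exports */
	void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr);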

This fixes a few weird corner-case bugs and firmly establishes the correct
ordering of MR destruction (a condensed sketch follows the list):
 - Stop parallel access to the mkey via the ODP xarray
 - Stop DMA
 - Release the umem
 - Clean up ODP children
 - Free/Recycle the MR
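
A condensed sketch of the resulting flow (error handling, the
IB_MR_TYPE_INTEGRITY teardown, cache-entry accounting and the reg_pages
statistics update are elided here; the mr.c hunk below is the
authoritative version):

	int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
	{
		struct mlx5_ib_mr *mr = to_mmr(ibmr);
		struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

		/* 1. Stop parallel access to the mkey via the ODP xarray */
		if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
		    refcount_read(&mr->mmkey.usecount) != 0 &&
		    xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			     mlx5_base_mkey(mr->mmkey.key)))
			mlx5r_deref_wait_odp_mkey(&mr->mmkey);

		/* 2. Stop DMA: revoke a cached mkey, or destroy it outright */
		if (mr->cache_ent && mlx5_mr_cache_invalidate(mr))
			mr->cache_ent = NULL;	/* cannot be recycled */
		if (!mr->cache_ent)
			destroy_mkey(dev, mr);

		/* 3. Release the umem and 4. clean up ODP children */
		if (mr->umem) {
			bool is_odp = is_odp_mr(mr);

			ib_umem_release(mr->umem);
			if (is_odp)
				mlx5_ib_free_odp_mr(mr);
		}

		/* 5. Free/Recycle the MR */
		if (mr->cache_ent) {
			mlx5_mr_cache_free(dev, mr);
		} else {
			mlx5_free_priv_descs(mr);
			kfree(mr);
		}
		return 0;
	}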

Link: https://lore.kernel.org/r/20210304120745.1090751-4-leon@kernel.org


Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
parent f18ec422
drivers/infiniband/core/umem_dmabuf.c  +4 −0
@@ -168,6 +168,10 @@ void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
{
	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;

	dma_resv_lock(dmabuf->resv, NULL);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
	dma_resv_unlock(dmabuf->resv);

	dma_buf_detach(dmabuf, umem_dmabuf->attach);
	dma_buf_put(dmabuf);
	kfree(umem_dmabuf);
drivers/infiniband/hw/mlx5/mlx5_ib.h  +1 −4
@@ -1285,8 +1285,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     struct ib_udata *udata,
					     int access_flags);
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr);
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 virt_addr, int access_flags,
				    struct ib_pd *pd, struct ib_udata *udata);
@@ -1334,8 +1333,6 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       unsigned int entry, int access_flags);
void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status);
drivers/infiniband/hw/mlx5/mr.c  +62 −71
@@ -119,8 +119,6 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
				create_mkey_callback, context);
}

static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

@@ -627,30 +625,10 @@ static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
	return NULL;
}

static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_cache_ent *ent = mr->cache_ent;

	mr->cache_ent = NULL;
	spin_lock_irq(&ent->lock);
	ent->total_mrs--;
	spin_unlock_irq(&ent->lock);
}

void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_cache_ent *ent = mr->cache_ent;

	if (!ent)
		return;

	if (mlx5_mr_cache_invalidate(mr)) {
		detach_mr_from_cache(mr);
		destroy_mkey(dev, mr);
		kfree(mr);
		return;
	}

	spin_lock_irq(&ent->lock);
	list_add_tail(&mr->list, &ent->head);
	ent->available_mrs++;
@@ -1503,7 +1481,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
		 */
		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			dereg_mr(dev, mr);
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
@@ -1560,7 +1538,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
	return &mr->ibmr;

err_dereg_mr:
	dereg_mr(dev, mr);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

@@ -1657,7 +1635,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
	return &mr->ibmr;

err_dereg_mr:
	dereg_mr(dev, mr);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

@@ -1669,7 +1647,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 * and any DMA inprogress will be completed. Failure of this function
 * indicates the HW has failed catastrophically.
 */
int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
static int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
{
	struct mlx5_umr_wr umrwr = {};

@@ -1941,69 +1919,82 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
	}
}

static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	if (mr->ibmr.type == IB_MR_TYPE_INTEGRITY) {
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount, once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), ibmr,
			   NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_wire.psv_idx))
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
		kfree(mr->sig);
		mr->sig = NULL;
	}

	if (!mr->cache_ent) {
		destroy_mkey(dev, mr);
		mlx5_free_priv_descs(mr);
	/* Stop DMA */
	if (mr->cache_ent) {
		if (mlx5_mr_cache_invalidate(mr)) {
			spin_lock_irq(&mr->cache_ent->lock);
			mr->cache_ent->total_mrs--;
			spin_unlock_irq(&mr->cache_ent->lock);
			mr->cache_ent = NULL;
		}
	}
	if (!mr->cache_ent) {
		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
		if (rc)
			return rc;
	}

static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct ib_umem *umem = mr->umem;
	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

	/* Stop all DMA */
	if (is_odp_mr(mr))
		mlx5_ib_fence_odp_mr(mr);
	else if (is_dmabuf_mr(mr))
		mlx5_ib_fence_dmabuf_mr(mr);
	else
		clean_mr(dev, mr);

	if (umem) {
		if (!is_odp_mr(mr))
			atomic_sub(ib_umem_num_pages(umem),
		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(umem);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (mr->cache_ent)
	if (mr->cache_ent) {
		mlx5_mr_cache_free(dev, mr);
	else
	} else {
		mlx5_free_priv_descs(mr);
		kfree(mr);
	}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
	}

	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
		mlx5_ib_free_implicit_mr(mmr);
		return 0;
	}

	dereg_mr(to_mdev(ibmr->device), mmr);

	return 0;
}

@@ -2175,10 +2166,10 @@ static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
drivers/infiniband/hw/mlx5/odp.c  +23 −128
@@ -181,63 +181,29 @@ void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
	}
}

static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
	mutex_lock(&odp->umem_mutex);
	if (odp->npages) {
		mlx5_mr_cache_invalidate(mr);
		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
					    ib_umem_end(odp));
		WARN_ON(odp->npages);
	}
	odp->private = NULL;
	mutex_unlock(&odp->umem_mutex);

	if (!mr->cache_ent) {
		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	}
}

/*
 * This must be called after the mr has been removed from implicit_children.
 * NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;

	mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (need_imr_xlt) {
	mutex_lock(&odp_imr->umem_mutex);
		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT,
			   1, 0,
			   MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	}

	dma_fence_odp_mr(mr);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);

	mlx5_mr_cache_free(mr_to_mdev(mr), mr);
	ib_umem_odp_release(odp);
}

static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
	struct mlx5_ib_mr *imr = mr->parent;

	free_implicit_child_mr(mr, true);
	mlx5r_deref_odp_mkey(&imr->mmkey);
}

@@ -454,8 +420,10 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,

	ret = mr = mlx5_mr_cache_alloc(
		mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
	if (IS_ERR(mr))
		goto out_umem;
	if (IS_ERR(mr)) {
		ib_umem_odp_release(odp);
		return mr;
	}

	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
@@ -505,9 +473,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
out_lock:
	xa_unlock(&imr->implicit_children);
out_mr:
	mlx5_mr_cache_free(mr_to_mdev(imr), mr);
out_umem:
	ib_umem_odp_release(odp);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ret;
}

@@ -530,8 +496,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,

	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
	if (IS_ERR(imr)) {
		err = PTR_ERR(imr);
		goto out_umem;
		ib_umem_odp_release(umem_odp);
		return imr;
	}

	imr->ibmr.pd = &pd->ibpd;
@@ -561,93 +527,22 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, imr);
out_umem:
	ib_umem_odp_release(umem_odp);
	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
	return ERR_PTR(err);
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct mlx5_ib_mr *mtt;
	unsigned long idx;

	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
	/*
	 * All work on the prefetch list must be completed, xa_erase() prevented
	 * new work from being created.
	 */
	mlx5r_deref_wait_odp_mkey(&imr->mmkey);
	/*
	 * At this point it is forbidden for any other thread to enter
	 * pagefault_mr() on this imr. It is already forbidden to call
	 * pagefault_mr() on an implicit child. Due to this additions to
	 * implicit_children are prevented.
	 * In addition, any new call to destroy_unused_implicit_child_mr()
	 * may return immediately.
	 */

	/*
	 * Fence the imr before we destroy the children. This allows us to
	 * skip updating the XLT of the imr during destroy of the child mkey
	 * the imr points to.
	 */
	mlx5_mr_cache_invalidate(imr);

	xa_for_each(&imr->implicit_children, idx, mtt) {
		xa_erase(&imr->implicit_children, idx);
		free_implicit_child_mr(mtt, false);
	}

	mlx5_mr_cache_free(dev, imr);
	ib_umem_odp_release(odp_imr);
}

/**
 * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
 * @mr: to fence
 *
 * On return no parallel threads will be touching this MR and no DMA will be
 * active.
 */
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	/* Prevent new page faults and prefetch requests from succeeding */
	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));

	/* Wait for all running page-fault handlers to finish. */
	mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	dma_fence_odp_mr(mr);
}

/**
 * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
 * @mr: to fence
 *
 * On return no parallel threads will be touching this MR and no DMA will be
 * active.
	 * If this is an implicit MR it is already invalidated so we can just
	 * delete the children mkeys.
	 */
void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);

	/* Prevent new page faults and prefetch requests from succeeding */
	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));

	mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	mlx5_mr_cache_invalidate(mr);
	umem_dmabuf->private = NULL;
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

	if (!mr->cache_ent) {
		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	xa_for_each(&mr->implicit_children, idx, mtt) {
		xa_erase(&mr->implicit_children, idx);
		mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
	}
}