Commit 8974efaa authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull rdma fixes from Jason Gunthorpe:

 - Several hfi1 patches fixing some long standing driver bugs

 - Overflow when working with sg lists with elements greater than 4G

 - An rxe regression with object numbering after the mrs reach their
   limit

 - A theoretical problem with the scatterlist merging code

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  lib/scatterlist: Fix to calculate the last_pg properly
  IB/hfi1: Remove user expected buffer invalidate race
  IB/hfi1: Immediately remove invalid memory from hardware
  IB/hfi1: Fix expected receive setup error exit issues
  IB/hfi1: Reserve user expected TIDs
  IB/hfi1: Reject a zero-length user expected buffer
  RDMA/core: Fix ib block iterator counter overflow
  RDMA/rxe: Prevent faulty rkey generation
  RDMA/rxe: Fix inaccurate constants in rxe_type_info
parents edc00350 0f097f08
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
	unsigned int block_offset;
	unsigned int sg_delta;

	if (!biter->__sg_nents || !biter->__sg)
		return false;

	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
	biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
	sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;

	if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
	if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
		biter->__sg_advance += sg_delta;
	} else {
		biter->__sg_advance = 0;
		biter->__sg = sg_next(biter->__sg);
		biter->__sg_nents--;
+140 −60
Original line number Diff line number Diff line
@@ -23,18 +23,25 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
			         const struct mmu_notifier_range *range,
			         unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
@@ -253,53 +260,65 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;
	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
		goto fail_unreserve;
	}

	tididx = 0;
@@ -395,43 +414,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
		ret = -EFAULT;
			goto bail;
		}
		goto fail_unprogram;
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them nex time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@@ -452,7 +506,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
@@ -706,6 +760,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
@@ -721,11 +776,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

@@ -745,8 +795,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
@@ -769,9 +818,6 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);
@@ -779,23 +825,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@@ -854,10 +911,16 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
@@ -887,6 +950,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
			         const struct mmu_notifier_range *range,
			         unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
+3 −0
Original line number Diff line number Diff line
@@ -16,6 +16,8 @@ struct tid_pageset {
};

struct tid_user_buf {
	struct mmu_interval_notifier notifier;
	struct mutex cover_mutex;
	unsigned long vaddr;
	unsigned long length;
	unsigned int npages;
@@ -27,6 +29,7 @@ struct tid_user_buf {
struct tid_rb_node {
	struct mmu_interval_notifier notifier;
	struct hfi1_filedata *fdata;
	struct mutex invalidate_mutex; /* covers hw removal */
	unsigned long phys;
	struct tid_group *grp;
	u32 rcventry;
+5 −5
Original line number Diff line number Diff line
@@ -98,11 +98,11 @@ enum rxe_device_param {
	RXE_MAX_SRQ			= DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX,

	RXE_MIN_MR_INDEX		= 0x00000001,
	RXE_MAX_MR_INDEX		= DEFAULT_MAX_VALUE,
	RXE_MAX_MR			= DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX,
	RXE_MIN_MW_INDEX		= 0x00010001,
	RXE_MAX_MW_INDEX		= 0x00020000,
	RXE_MAX_MW			= 0x00001000,
	RXE_MAX_MR_INDEX		= DEFAULT_MAX_VALUE >> 1,
	RXE_MAX_MR			= RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX,
	RXE_MIN_MW_INDEX		= RXE_MAX_MR_INDEX + 1,
	RXE_MAX_MW_INDEX		= DEFAULT_MAX_VALUE,
	RXE_MAX_MW			= RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX,

	RXE_MAX_PKT_PER_ACK		= 64,

+11 −11
Original line number Diff line number Diff line
@@ -23,16 +23,16 @@ static const struct rxe_type_info {
		.size		= sizeof(struct rxe_ucontext),
		.elem_offset	= offsetof(struct rxe_ucontext, elem),
		.min_index	= 1,
		.max_index	= UINT_MAX,
		.max_elem	= UINT_MAX,
		.max_index	= RXE_MAX_UCONTEXT,
		.max_elem	= RXE_MAX_UCONTEXT,
	},
	[RXE_TYPE_PD] = {
		.name		= "pd",
		.size		= sizeof(struct rxe_pd),
		.elem_offset	= offsetof(struct rxe_pd, elem),
		.min_index	= 1,
		.max_index	= UINT_MAX,
		.max_elem	= UINT_MAX,
		.max_index	= RXE_MAX_PD,
		.max_elem	= RXE_MAX_PD,
	},
	[RXE_TYPE_AH] = {
		.name		= "ah",
@@ -40,7 +40,7 @@ static const struct rxe_type_info {
		.elem_offset	= offsetof(struct rxe_ah, elem),
		.min_index	= RXE_MIN_AH_INDEX,
		.max_index	= RXE_MAX_AH_INDEX,
		.max_elem	= RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1,
		.max_elem	= RXE_MAX_AH,
	},
	[RXE_TYPE_SRQ] = {
		.name		= "srq",
@@ -49,7 +49,7 @@ static const struct rxe_type_info {
		.cleanup	= rxe_srq_cleanup,
		.min_index	= RXE_MIN_SRQ_INDEX,
		.max_index	= RXE_MAX_SRQ_INDEX,
		.max_elem	= RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1,
		.max_elem	= RXE_MAX_SRQ,
	},
	[RXE_TYPE_QP] = {
		.name		= "qp",
@@ -58,7 +58,7 @@ static const struct rxe_type_info {
		.cleanup	= rxe_qp_cleanup,
		.min_index	= RXE_MIN_QP_INDEX,
		.max_index	= RXE_MAX_QP_INDEX,
		.max_elem	= RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1,
		.max_elem	= RXE_MAX_QP,
	},
	[RXE_TYPE_CQ] = {
		.name		= "cq",
@@ -66,8 +66,8 @@ static const struct rxe_type_info {
		.elem_offset	= offsetof(struct rxe_cq, elem),
		.cleanup	= rxe_cq_cleanup,
		.min_index	= 1,
		.max_index	= UINT_MAX,
		.max_elem	= UINT_MAX,
		.max_index	= RXE_MAX_CQ,
		.max_elem	= RXE_MAX_CQ,
	},
	[RXE_TYPE_MR] = {
		.name		= "mr",
@@ -76,7 +76,7 @@ static const struct rxe_type_info {
		.cleanup	= rxe_mr_cleanup,
		.min_index	= RXE_MIN_MR_INDEX,
		.max_index	= RXE_MAX_MR_INDEX,
		.max_elem	= RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1,
		.max_elem	= RXE_MAX_MR,
	},
	[RXE_TYPE_MW] = {
		.name		= "mw",
@@ -85,7 +85,7 @@ static const struct rxe_type_info {
		.cleanup	= rxe_mw_cleanup,
		.min_index	= RXE_MIN_MW_INDEX,
		.max_index	= RXE_MAX_MW_INDEX,
		.max_elem	= RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1,
		.max_elem	= RXE_MAX_MW,
	},
};

Loading