Commit d1dfccca authored by Steve Sistare, committed by sanglipeng
Browse files

vfio/type1: prevent underflow of locked_vm via exec()

stable inclusion
from stable-v5.10.173
commit 5a271242716846cc016736fb76be2b40ee49b0c3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8BFR3

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=5a271242716846cc016736fb76be2b40ee49b0c3



--------------------------------

commit 046eca50 upstream.

When a vfio container is preserved across exec, the task does not change,
but it gets a new mm with locked_vm=0, and loses the count from existing
dma mappings.  If the user later unmaps a dma mapping, locked_vm underflows
to a large unsigned value, and a subsequent dma map request fails with
ENOMEM in __account_locked_vm.

To avoid underflow, grab and save the mm at the time a dma is mapped.
Use that mm when adjusting locked_vm, rather than re-acquiring the saved
task's mm, which may have changed.  If the saved mm is dead, do nothing.

locked_vm is incremented for existing mappings in a subsequent patch.

Fixes: 73fa0d10 ("vfio: Type1 IOMMU implementation")
Cc: stable@vger.kernel.org
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1675184289-267876-3-git-send-email-steven.sistare@oracle.com


Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: sanglipeng <sanglipeng1@jd.com>
parent 0418c317
Loading
Loading
Loading
Loading
+14 −27
Original line number Diff line number Diff line
@@ -100,6 +100,7 @@ struct vfio_dma {
	struct task_struct	*task;
	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
	unsigned long		*bitmap;
	struct mm_struct	*mm;
};

/* Bond between group and mm */
@@ -405,8 +406,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
	if (!npage)
		return 0;

	mm = async ? get_task_mm(dma->task) : dma->task->mm;
	if (!mm)
	mm = dma->mm;
	if (async && !mmget_not_zero(mm))
		return -ESRCH; /* process exited */

	ret = mmap_write_lock_killable(mm);
@@ -680,8 +681,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(dma->task);
	if (!mm)
	mm = dma->mm;
	if (!mmget_not_zero(mm))
		return -ENODEV;

	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
@@ -691,7 +692,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
	ret = 0;

	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
		ret = vfio_lock_acct(dma, 1, true);
		ret = vfio_lock_acct(dma, 1, false);
		if (ret) {
			put_pfn(*pfn_base, dma->prot);
			if (ret == -ENOMEM)
@@ -1045,6 +1046,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
	vfio_unmap_unpin(iommu, dma, true);
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	mmdrop(dma->mm);
	vfio_dma_bitmap_free(dma);
	kfree(dma);
	iommu->dma_avail++;
@@ -1629,29 +1631,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
	 * against the locked memory limit and we need to be able to do both
	 * outside of this call path as pinning can be asynchronous via the
	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
	 * task_struct and VM locked pages requires an mm_struct, however
	 * holding an indefinite mm reference is not recommended, therefore we
	 * only hold a reference to a task.  We could hold a reference to
	 * current, however QEMU uses this call path through vCPU threads,
	 * which can be killed resulting in a NULL mm and failure in the unmap
	 * path when called via a different thread.  Avoid this problem by
	 * using the group_leader as threads within the same group require
	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
	 * mm_struct.
	 *
	 * Previously we also used the task for testing CAP_IPC_LOCK at the
	 * time of pinning and accounting, however has_capability() makes use
	 * of real_cred, a copy-on-write field, so we can't guarantee that it
	 * matches group_leader, or in fact that it might not change by the
	 * time it's evaluated.  If a process were to call MAP_DMA with
	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
	 * possibly see different results for an iommu_mapped vfio_dma vs
	 * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
	 * time of calling MAP_DMA.
	 * task_struct. Save the group_leader so that all DMA tracking uses
	 * the same task, to make debugging easier.  VM locked pages requires
	 * an mm_struct, so grab the mm in case the task dies.
	 */
	get_task_struct(current->group_leader);
	dma->task = current->group_leader;
	dma->lock_cap = capable(CAP_IPC_LOCK);
	dma->mm = current->mm;
	mmgrab(dma->mm);

	dma->pfn_list = RB_ROOT;

@@ -3596,9 +3584,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
			!(dma->prot & IOMMU_READ))
		return -EPERM;

	mm = get_task_mm(dma->task);

	if (!mm)
	mm = dma->mm;
	if (!mmget_not_zero(mm))
		return -EPERM;

	if (kthread)