Commit 53f4e528 authored by Daniel Jordan, committed by Xie XiuQi
Browse files

mm: change locked_vm's type from unsigned long to atomic_long_t



hulk inclusion
category: feature
bugzilla: 13228
CVE: NA
---------------------------

Currently, mmap_sem must be held as writer to modify the locked_vm field
in mm_struct.

This creates a bottleneck when multithreading VFIO page pinning because
each thread holds the mmap_sem as reader for the majority of the pinning
time but also takes mmap_sem as writer regularly, for short times, when
modifying locked_vm.

The problem gets worse when other workloads compete for CPU with ktask
threads doing page pinning because the other workloads force ktask
threads that hold mmap_sem as writer off the CPU, blocking ktask threads
trying to get mmap_sem as reader for an excessively long time (the
mmap_sem reader wait time grows linearly with the thread count).

Requiring mmap_sem for locked_vm also abuses mmap_sem by making it
protect data that could be synchronized separately.

So, decouple locked_vm from mmap_sem by making locked_vm an
atomic_long_t.  locked_vm's old type was unsigned long, and changing it
to a signed type halves its capacity, but that is only a concern on
32-bit systems, where LONG_MAX * PAGE_SIZE is still 8T on x86 (with 4K
pages), so there's headroom.

Now that mmap_sem is not taken as writer here, ktask threads holding
mmap_sem as reader can run more often.  Performance results appear later
in the series.

On powerpc, this was only cross-compile tested.

[XXX Can send separately.]

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Hongbo Yao <yaohongbo@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Tested-by: Hongbo Yao <yaohongbo@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
parent b0908eee
Loading
Loading
Loading
Loading
+8 −7
Original line number Diff line number Diff line
@@ -58,33 +58,34 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)

static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
{
	long ret = 0;
	long locked_vm, ret = 0;

	if (!current || !current->mm)
		return ret; /* process exited */

	down_write(&current->mm->mmap_sem);

	locked_vm = atomic_long_read(&current->mm->locked_vm);
	if (inc) {
		unsigned long locked, lock_limit;

		locked = current->mm->locked_vm + stt_pages;
		locked = locked_vm + stt_pages;
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			ret = -ENOMEM;
		else
			current->mm->locked_vm += stt_pages;
			atomic_long_add(stt_pages, &current->mm->locked_vm);
	} else {
		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
			stt_pages = current->mm->locked_vm;
		if (WARN_ON_ONCE(stt_pages > locked_vm))
			stt_pages = locked_vm;

		current->mm->locked_vm -= stt_pages;
		atomic_long_sub(stt_pages, &current->mm->locked_vm);
	}

	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
			inc ? '+' : '-',
			stt_pages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			atomic_long_read(&current->mm->locked_vm) << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

+8 −8
Original line number Diff line number Diff line
@@ -41,31 +41,31 @@ struct mm_iommu_table_group_mem_t {
static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
		unsigned long npages, bool incr)
{
	long ret = 0, locked, lock_limit;
	long ret = 0, locked, lock_limit, locked_vm;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);

	locked_vm = atomic_long_read(&mm->locked_vm);
	if (incr) {
		locked = mm->locked_vm + npages;
		locked = locked_vm + npages;
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			ret = -ENOMEM;
		else
			mm->locked_vm += npages;
			atomic_long_add(npages, &mm->locked_vm);
	} else {
		if (WARN_ON_ONCE(npages > mm->locked_vm))
			npages = mm->locked_vm;
		mm->locked_vm -= npages;
		if (WARN_ON_ONCE(npages > locked_vm))
			npages = locked_vm;
		atomic_long_sub(npages, &mm->locked_vm);
	}

	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
			current ? current->pid : 0,
			incr ? '+' : '-',
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);

+9 −7
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
{
	unsigned long locked, lock_limit;
	long locked_vm;
	int ret = 0;

	/* the task is exiting. */
@@ -53,24 +54,25 @@ static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)

	down_write(&current->mm->mmap_sem);

	locked_vm = atomic_long_read(&current->mm->locked_vm);
	if (incr) {
		locked = current->mm->locked_vm + npages;
		locked = locked_vm + npages;
		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			ret = -ENOMEM;
		else
			current->mm->locked_vm += npages;
			atomic_long_add(npages, &current->mm->locked_vm);
	} else {
		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
			npages = current->mm->locked_vm;
		current->mm->locked_vm -= npages;
		if (WARN_ON_ONCE(npages > locked_vm))
			npages = locked_vm;
		atomic_long_sub(npages, &current->mm->locked_vm);
	}

	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
		incr ? '+' : '-', npages << PAGE_SHIFT,
		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
		ret ? "- execeeded" : "");
		atomic_long_read(&current->mm->locked_vm) << PAGE_SHIFT,
		rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : "");

	up_write(&current->mm->mmap_sem);

+1 −1
Original line number Diff line number Diff line
@@ -60,7 +60,7 @@ static void usnic_uiom_reg_account(struct work_struct *work)
						struct usnic_uiom_reg, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->locked_vm -= umem->diff;
	atomic_long_sub(umem->diff, &umem->mm->locked_vm);
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
+7 −7
Original line number Diff line number Diff line
@@ -45,16 +45,16 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	locked = atomic_long_read(&mm->locked_vm) + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;
		atomic_long_add(npages, &mm->locked_vm);

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

@@ -69,12 +69,12 @@ static void decrement_locked_vm(struct mm_struct *mm, long npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	if (WARN_ON_ONCE(npages > atomic_long_read(&mm->locked_vm)))
		npages = atomic_long_read(&mm->locked_vm);
	atomic_long_sub(npages, &mm->locked_vm);
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}
Loading