Commit d919b33d authored by Alexey Dobriyan's avatar Alexey Dobriyan Committed by Linus Torvalds
Browse files

proc: faster open/read/close with "permanent" files



Now that "struct proc_ops" exist we can start putting there stuff which
could not fly with VFS "struct file_operations"...

Most of fs/proc/inode.c file is dedicated to make open/read/.../close
reliable in the event of disappearing /proc entries which usually happens
if module is getting removed.  Files like /proc/cpuinfo which never
disappear simply do not need such protection.

Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
"permanent" files.

Enable "permanent" flag for

	/proc/cpuinfo
	/proc/kmsg
	/proc/modules
	/proc/slabinfo
	/proc/stat
	/proc/sysvipc/*
	/proc/swaps

More will come once I figure out foolproof way to prevent out module
authors from marking their stuff "permanent" for performance reasons
when it is not.

This should help with scalability: benchmark is "read /proc/cpuinfo R times
by N threads scattered over the system".

	N	R	t, s (before)	t, s (after)
	-----------------------------------------------------
	64	4096	1.582458	1.530502	-3.2%
	256	4096	6.371926	6.125168	-3.9%
	1024	4096	25.64888	24.47528	-4.6%

Benchmark source:

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

const int NR_CPUS = sysconf(_SC_NPROCESSORS_ONLN);
int N;
const char *filename;
int R;

int xxx = 0;

int glue(int n)
{
	cpu_set_t m;
	CPU_ZERO(&m);
	CPU_SET(n, &m);
	return sched_setaffinity(0, sizeof(cpu_set_t), &m);
}

void f(int n)
{
	glue(n % NR_CPUS);

	while (*(volatile int *)&xxx == 0) {
	}

	for (int i = 0; i < R; i++) {
		int fd = open(filename, O_RDONLY);
		char buf[4096];
		ssize_t rv = read(fd, buf, sizeof(buf));
		asm volatile ("" :: "g" (rv));
		close(fd);
	}
}

int main(int argc, char *argv[])
{
	if (argc < 4) {
		std::cerr << "usage: " << argv[0] << ' ' << "N /proc/filename R
";
		return 1;
	}

	N = atoi(argv[1]);
	filename = argv[2];
	R = atoi(argv[3]);

	for (int i = 0; i < NR_CPUS; i++) {
		if (glue(i) == 0)
			break;
	}

	std::vector<std::thread> T;
	T.reserve(N);
	for (int i = 0; i < N; i++) {
		T.emplace_back(f, i);
	}

	auto t0 = std::chrono::system_clock::now();
	{
		*(volatile int *)&xxx = 1;
		for (auto& t: T) {
			t.join();
		}
	}
	auto t1 = std::chrono::system_clock::now();
	std::chrono::duration<double> dt = t1 - t0;
	std::cout << dt.count() << '
';

	return 0;
}

P.S.:
Explicit randomization marker is added because adding non-function pointer
will silently disable structure layout randomization.

[akpm@linux-foundation.org: coding style fixes]
Reported-by: default avatarkbuild test robot <lkp@intel.com>
Reported-by: default avatarDan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: default avatarAlexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Joe Perches <joe@perches.com>
Link: http://lkml.kernel.org/r/20200222201539.GA22576@avx2


Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 904f394e
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
}

static const struct proc_ops cpuinfo_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= cpuinfo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
+28 −3
Original line number Diff line number Diff line
@@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
	return p;
}

static inline void pde_set_flags(struct proc_dir_entry *pde)
{
	if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
		pde->flags |= PROC_ENTRY_PERMANENT;
}

struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
		struct proc_dir_entry *parent,
		const struct proc_ops *proc_ops, void *data)
@@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
	if (!p)
		return NULL;
	p->proc_ops = proc_ops;
	pde_set_flags(p);
	return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
}

static const struct proc_ops proc_seq_ops = {
	/* not permanent -- can call into arbitrary seq_operations */
	.proc_open	= proc_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
@@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
}

static const struct proc_ops proc_single_ops = {
	/* not permanent -- can call into arbitrary ->single_show */
	.proc_open	= proc_single_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
@@ -662,8 +671,12 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)

	de = pde_subdir_find(parent, fn, len);
	if (de) {
		if (unlikely(pde_is_permanent(de))) {
			WARN(1, "removing permanent /proc entry '%s'", de->name);
			de = NULL;
		} else {
			rb_erase(&de->subdir_node, &parent->subdir);
		if (S_ISDIR(de->mode)) {
			if (S_ISDIR(de->mode))
				parent->nlink--;
		}
	}
@@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
		write_unlock(&proc_subdir_lock);
		return -ENOENT;
	}
	if (unlikely(pde_is_permanent(root))) {
		write_unlock(&proc_subdir_lock);
		WARN(1, "removing permanent /proc entry '%s/%s'",
			root->parent->name, root->name);
		return -EINVAL;
	}
	rb_erase(&root->subdir_node, &parent->subdir);

	de = root;
	while (1) {
		next = pde_subdir_first(de);
		if (next) {
			if (unlikely(pde_is_permanent(root))) {
				write_unlock(&proc_subdir_lock);
				WARN(1, "removing permanent /proc entry '%s/%s'",
					next->parent->name, next->name);
				return -EINVAL;
			}
			rb_erase(&next->subdir_node, &de->subdir);
			de = next;
			continue;
+137 −50
Original line number Diff line number Diff line
@@ -259,123 +259,180 @@ void proc_entry_rundown(struct proc_dir_entry *de)
	spin_unlock(&de->pde_unload_lock);
}

static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	loff_t rv = -EINVAL;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_lseek) lseek;

	lseek = pde->proc_ops->proc_lseek;
	if (!lseek)
		lseek = default_llseek;
		rv = lseek(file, offset, whence);
	return lseek(file, offset, whence);
}

static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	loff_t rv = -EINVAL;

	if (pde_is_permanent(pde)) {
		return pde_lseek(pde, file, offset, whence);
	} else if (use_pde(pde)) {
		rv = pde_lseek(pde, file, offset, whence);
		unuse_pde(pde);
	}
	return rv;
}

static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t rv = -EIO;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_read) read;

	read = pde->proc_ops->proc_read;
	if (read)
			rv = read(file, buf, count, ppos);
		return read(file, buf, count, ppos);
	return -EIO;
}

static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t rv = -EIO;

	if (pde_is_permanent(pde)) {
		return pde_read(pde, file, buf, count, ppos);
	} else if (use_pde(pde)) {
		rv = pde_read(pde, file, buf, count, ppos);
		unuse_pde(pde);
	}
	return rv;
}

static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t rv = -EIO;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_write) write;

	write = pde->proc_ops->proc_write;
	if (write)
			rv = write(file, buf, count, ppos);
		return write(file, buf, count, ppos);
	return -EIO;
}

static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t rv = -EIO;

	if (pde_is_permanent(pde)) {
		return pde_write(pde, file, buf, count, ppos);
	} else if (use_pde(pde)) {
		rv = pde_write(pde, file, buf, count, ppos);
		unuse_pde(pde);
	}
	return rv;
}

static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	__poll_t rv = DEFAULT_POLLMASK;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_poll) poll;

	poll = pde->proc_ops->proc_poll;
	if (poll)
			rv = poll(file, pts);
		return poll(file, pts);
	return DEFAULT_POLLMASK;
}

static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	__poll_t rv = DEFAULT_POLLMASK;

	if (pde_is_permanent(pde)) {
		return pde_poll(pde, file, pts);
	} else if (use_pde(pde)) {
		rv = pde_poll(pde, file, pts);
		unuse_pde(pde);
	}
	return rv;
}

static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	long rv = -ENOTTY;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_ioctl) ioctl;

	ioctl = pde->proc_ops->proc_ioctl;
	if (ioctl)
			rv = ioctl(file, cmd, arg);
		return ioctl(file, cmd, arg);
	return -ENOTTY;
}

static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	long rv = -ENOTTY;

	if (pde_is_permanent(pde)) {
		return pde_ioctl(pde, file, cmd, arg);
	} else if (use_pde(pde)) {
		rv = pde_ioctl(pde, file, cmd, arg);
		unuse_pde(pde);
	}
	return rv;
}

#ifdef CONFIG_COMPAT
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	long rv = -ENOTTY;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;

	compat_ioctl = pde->proc_ops->proc_compat_ioctl;
	if (compat_ioctl)
			rv = compat_ioctl(file, cmd, arg);
		return compat_ioctl(file, cmd, arg);
	return -ENOTTY;
}

static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	long rv = -ENOTTY;
	if (pde_is_permanent(pde)) {
		return pde_compat_ioctl(pde, file, cmd, arg);
	} else if (use_pde(pde)) {
		rv = pde_compat_ioctl(pde, file, cmd, arg);
		unuse_pde(pde);
	}
	return rv;
}
#endif

static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	int rv = -EIO;
	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_mmap) mmap;

	mmap = pde->proc_ops->proc_mmap;
	if (mmap)
			rv = mmap(file, vma);
		return mmap(file, vma);
	return -EIO;
}

static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	int rv = -EIO;

	if (pde_is_permanent(pde)) {
		return pde_mmap(pde, file, vma);
	} else if (use_pde(pde)) {
		rv = pde_mmap(pde, file, vma);
		unuse_pde(pde);
	}
	return rv;
}

static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
			   unsigned long len, unsigned long pgoff,
			   unsigned long flags)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	unsigned long rv = -EIO;

	if (use_pde(pde)) {
	typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;

	get_area = pde->proc_ops->proc_get_unmapped_area;
@@ -383,11 +440,23 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
	if (!get_area)
		get_area = current->mm->get_unmapped_area;
#endif

	if (get_area)
			rv = get_area(file, orig_addr, len, pgoff, flags);
		else
			rv = orig_addr;
		return get_area(file, orig_addr, len, pgoff, flags);
	return orig_addr;
}

static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
			   unsigned long len, unsigned long pgoff,
			   unsigned long flags)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	unsigned long rv = -EIO;

	if (pde_is_permanent(pde)) {
		return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
	} else if (use_pde(pde)) {
		rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
		unuse_pde(pde);
	}
	return rv;
@@ -401,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file)
	typeof_member(struct proc_ops, proc_release) release;
	struct pde_opener *pdeo;

	if (pde_is_permanent(pde)) {
		open = pde->proc_ops->proc_open;
		if (open)
			rv = open(inode, file);
		return rv;
	}

	/*
	 * Ensure that
	 * 1) PDE's ->release hook will be called no matter what
@@ -450,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file)
{
	struct proc_dir_entry *pde = PDE(inode);
	struct pde_opener *pdeo;

	if (pde_is_permanent(pde)) {
		typeof_member(struct proc_ops, proc_release) release;

		release = pde->proc_ops->proc_release;
		if (release) {
			return release(inode, file);
		}
		return 0;
	}

	spin_lock(&pde->pde_unload_lock);
	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
		if (pdeo->file == file) {
+6 −0
Original line number Diff line number Diff line
@@ -61,6 +61,7 @@ struct proc_dir_entry {
	struct rb_node subdir_node;
	char *name;
	umode_t mode;
	u8 flags;
	u8 namelen;
	char inline_name[];
} __randomize_layout;
@@ -73,6 +74,11 @@ struct proc_dir_entry {
	0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
	return pde->flags & PROC_ENTRY_PERMANENT;
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

+1 −0
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)


static const struct proc_ops kmsg_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_read	= kmsg_read,
	.proc_poll	= kmsg_poll,
	.proc_open	= kmsg_open,
Loading