Commit 3644286f authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull fsnotify updates from Jan Kara:

 - support for limited fanotify functionality for unprivileged users

 - faster merging of fanotify events

 - a few smaller fsnotify improvements

* tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  shmem: allow reporting fanotify events with file handles on tmpfs
  fs: introduce a wrapper uuid_to_fsid()
  fanotify_user: use upper_32_bits() to verify mask
  fanotify: support limited functionality for unprivileged users
  fanotify: configurable limits via sysfs
  fanotify: limit number of event merge attempts
  fsnotify: use hash table for faster events merge
  fanotify: mix event info and pid into merge key hash
  fanotify: reduce event objectid to 29-bit hash
  fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue
parents 767fcbc8 59cda49e
Loading
Loading
Loading
Loading
+1 −4
Original line number Diff line number Diff line
@@ -1399,7 +1399,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
	struct super_block *sb = dentry->d_sb;
	struct ext2_sb_info *sbi = EXT2_SB(sb);
	struct ext2_super_block *es = sbi->s_es;
	u64 fsid;

	spin_lock(&sbi->s_lock);

@@ -1453,9 +1452,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
	buf->f_ffree = ext2_count_free_inodes(sb);
	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
	buf->f_namelen = EXT2_NAME_LEN;
	fsid = le64_to_cpup((void *)es->s_uuid) ^
	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
	buf->f_fsid = u64_to_fsid(fsid);
	buf->f_fsid = uuid_to_fsid(es->s_uuid);
	spin_unlock(&sbi->s_lock);
	return 0;
}
+1 −4
Original line number Diff line number Diff line
@@ -6153,7 +6153,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	ext4_fsblk_t overhead = 0, resv_blocks;
	u64 fsid;
	s64 bfree;
	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));

@@ -6174,9 +6173,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
	buf->f_files = le32_to_cpu(es->s_inodes_count);
	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
	buf->f_namelen = EXT4_NAME_LEN;
	fsid = le64_to_cpup((void *)es->s_uuid) ^
	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
	buf->f_fsid = u64_to_fsid(fsid);
	buf->f_fsid = uuid_to_fsid(es->s_uuid);

#ifdef CONFIG_QUOTA
	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+119 −47
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <linux/audit.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/stringhash.h>

#include "fanotify.h"

@@ -22,12 +23,24 @@ static bool fanotify_path_equal(struct path *p1, struct path *p2)
	return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
}

static unsigned int fanotify_hash_path(const struct path *path)
{
	/* Fold both components of the path into one event merge-key hash. */
	unsigned int h = hash_ptr(path->dentry, FANOTIFY_EVENT_HASH_BITS);

	h ^= hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS);
	return h;
}

static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
				       __kernel_fsid_t *fsid2)
{
	/* Two fsids are equal only when both 32-bit halves match. */
	if (fsid1->val[0] != fsid2->val[0])
		return false;

	return fsid1->val[1] == fsid2->val[1];
}

static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid)
{
	/* Mix both halves of the fsid into the event merge-key hash. */
	unsigned int h = hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS);

	h ^= hash_32(fsid->val[1], FANOTIFY_EVENT_HASH_BITS);
	return h;
}

static bool fanotify_fh_equal(struct fanotify_fh *fh1,
			      struct fanotify_fh *fh2)
{
@@ -38,6 +51,16 @@ static bool fanotify_fh_equal(struct fanotify_fh *fh1,
		!memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len);
}

static unsigned int fanotify_hash_fh(struct fanotify_fh *fh)
{
	/* Salt the hash with the handle header (type in the low byte,
	 * length above it) so handles differing only in header fields
	 * hash differently. */
	long hdr_salt = ((long)fh->len << 8) | (long)fh->type;

	/*
	 * full_name_hash() works long by long, so it handles fh buf optimally.
	 */
	return full_name_hash((void *)hdr_salt, fanotify_fh_buf(fh), fh->len);
}

static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1,
				     struct fanotify_fid_event *ffe2)
{
@@ -88,16 +111,12 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
	return fanotify_info_equal(info1, info2);
}

static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
				  struct fsnotify_event *new_fsn)
static bool fanotify_should_merge(struct fanotify_event *old,
				  struct fanotify_event *new)
{
	struct fanotify_event *old, *new;
	pr_debug("%s: old=%p new=%p\n", __func__, old, new);

	pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
	old = FANOTIFY_E(old_fsn);
	new = FANOTIFY_E(new_fsn);

	if (old_fsn->objectid != new_fsn->objectid ||
	if (old->hash != new->hash ||
	    old->type != new->type || old->pid != new->pid)
		return false;

@@ -129,14 +148,20 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
	return false;
}

/* Limit event merges to limit CPU overhead per event */
#define FANOTIFY_MAX_MERGE_EVENTS 128

/* and the list better be locked by something too! */
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
static int fanotify_merge(struct fsnotify_group *group,
			  struct fsnotify_event *event)
{
	struct fsnotify_event *test_event;
	struct fanotify_event *new;
	struct fanotify_event *old, *new = FANOTIFY_E(event);
	unsigned int bucket = fanotify_event_hash_bucket(group, new);
	struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket];
	int i = 0;

	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
	new = FANOTIFY_E(event);
	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, bucket);

	/*
	 * Don't merge a permission event with any other event so that we know
@@ -146,9 +171,11 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
	if (fanotify_is_perm_event(new->mask))
		return 0;

	list_for_each_entry_reverse(test_event, list, list) {
		if (fanotify_should_merge(test_event, event)) {
			FANOTIFY_E(test_event)->mask |= new->mask;
	hlist_for_each_entry(old, hlist, merge_list) {
		if (++i > FANOTIFY_MAX_MERGE_EVENTS)
			break;
		if (fanotify_should_merge(old, new)) {
			old->mask |= new->mask;
			return 1;
		}
	}
@@ -184,8 +211,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
			return ret;
		}
		/* Event not yet reported? Just remove it. */
		if (event->state == FAN_EVENT_INIT)
		if (event->state == FAN_EVENT_INIT) {
			fsnotify_remove_queued_event(group, &event->fae.fse);
			/* Permission events are not supposed to be hashed */
			WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list));
		}
		/*
		 * Event may be also answered in case signal delivery raced
		 * with wakeup. In that case we have nothing to do besides
@@ -329,7 +359,8 @@ static int fanotify_encode_fh_len(struct inode *inode)
 * Return 0 on failure to encode.
 */
static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
			      unsigned int fh_len, gfp_t gfp)
			      unsigned int fh_len, unsigned int *hash,
			      gfp_t gfp)
{
	int dwords, type = 0;
	char *ext_buf = NULL;
@@ -372,6 +403,9 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
	fh->type = type;
	fh->len = fh_len;

	/* Mix fh into event merge key */
	*hash ^= fanotify_hash_fh(fh);

	return FANOTIFY_FH_HDR_LEN + fh_len;

out_err:
@@ -425,6 +459,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
}

static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
							unsigned int *hash,
							gfp_t gfp)
{
	struct fanotify_path_event *pevent;
@@ -435,6 +470,7 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,

	pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH;
	pevent->path = *path;
	*hash ^= fanotify_hash_path(path);
	path_get(path);

	return &pevent->fae;
@@ -460,6 +496,7 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,

static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id,
						       __kernel_fsid_t *fsid,
						       unsigned int *hash,
						       gfp_t gfp)
{
	struct fanotify_fid_event *ffe;
@@ -470,16 +507,18 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id,

	ffe->fae.type = FANOTIFY_EVENT_TYPE_FID;
	ffe->fsid = *fsid;
	*hash ^= fanotify_hash_fsid(fsid);
	fanotify_encode_fh(&ffe->object_fh, id, fanotify_encode_fh_len(id),
			   gfp);
			   hash, gfp);

	return &ffe->fae;
}

static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
							__kernel_fsid_t *fsid,
							const struct qstr *file_name,
							const struct qstr *name,
							struct inode *child,
							unsigned int *hash,
							gfp_t gfp)
{
	struct fanotify_name_event *fne;
@@ -492,24 +531,30 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
	size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len;
	if (child_fh_len)
		size += FANOTIFY_FH_HDR_LEN + child_fh_len;
	if (file_name)
		size += file_name->len + 1;
	if (name)
		size += name->len + 1;
	fne = kmalloc(size, gfp);
	if (!fne)
		return NULL;

	fne->fae.type = FANOTIFY_EVENT_TYPE_FID_NAME;
	fne->fsid = *fsid;
	*hash ^= fanotify_hash_fsid(fsid);
	info = &fne->info;
	fanotify_info_init(info);
	dfh = fanotify_info_dir_fh(info);
	info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, 0);
	info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0);
	if (child_fh_len) {
		ffh = fanotify_info_file_fh(info);
		info->file_fh_totlen = fanotify_encode_fh(ffh, child, child_fh_len, 0);
		info->file_fh_totlen = fanotify_encode_fh(ffh, child,
							child_fh_len, hash, 0);
	}
	if (name) {
		long salt = name->len;

		fanotify_info_copy_name(info, name);
		*hash ^= full_name_hash((void *)salt, name->name, name->len);
	}
	if (file_name)
		fanotify_info_copy_name(info, file_name);

	pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n",
		 __func__, id->i_ino, size, dir_fh_len, child_fh_len,
@@ -533,6 +578,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
	struct mem_cgroup *old_memcg;
	struct inode *child = NULL;
	bool name_event = false;
	unsigned int hash = 0;
	bool ondir = mask & FAN_ONDIR;
	struct pid *pid;

	if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) {
		/*
@@ -540,8 +588,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
		 * report the child fid for events reported on a non-dir child
		 * in addition to reporting the parent fid and maybe child name.
		 */
		if ((fid_mode & FAN_REPORT_FID) &&
		    id != dirid && !(mask & FAN_ONDIR))
		if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir)
			child = id;

		id = dirid;
@@ -562,8 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
		if (!(fid_mode & FAN_REPORT_NAME)) {
			name_event = !!child;
			file_name = NULL;
		} else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   !(mask & FAN_ONDIR)) {
		} else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) {
			name_event = true;
		}
	}
@@ -586,26 +632,25 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
		event = fanotify_alloc_perm_event(path, gfp);
	} else if (name_event && (file_name || child)) {
		event = fanotify_alloc_name_event(id, fsid, file_name, child,
						  gfp);
						  &hash, gfp);
	} else if (fid_mode) {
		event = fanotify_alloc_fid_event(id, fsid, gfp);
		event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
	} else {
		event = fanotify_alloc_path_event(path, gfp);
		event = fanotify_alloc_path_event(path, &hash, gfp);
	}

	if (!event)
		goto out;

	/*
	 * Use the victim inode instead of the watching inode as the id for
	 * event queue, so event reported on parent is merged with event
	 * reported on child when both directory and child watches exist.
	 */
	fanotify_init_event(event, (unsigned long)id, mask);
	if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
		event->pid = get_pid(task_pid(current));
		pid = get_pid(task_pid(current));
	else
		event->pid = get_pid(task_tgid(current));
		pid = get_pid(task_tgid(current));

	/* Mix event info, FAN_ONDIR flag and pid into event merge key */
	hash ^= hash_long((unsigned long)pid | ondir, FANOTIFY_EVENT_HASH_BITS);
	fanotify_init_event(event, hash, mask);
	event->pid = pid;

out:
	set_active_memcg(old_memcg);
@@ -645,6 +690,24 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
	return fsid;
}

/*
 * Add an event to hash table for faster merge.
 */
static void fanotify_insert_event(struct fsnotify_group *group,
				  struct fsnotify_event *fsn_event)
{
	struct fanotify_event *event = FANOTIFY_E(fsn_event);
	unsigned int bucket = fanotify_event_hash_bucket(group, event);

	/* The merge hash is protected by the notification lock. */
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, bucket);

	hlist_add_head(&event->merge_list,
		       &group->fanotify_data.merge_hash[bucket]);
}

static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
				 const void *data, int data_type,
				 struct inode *dir,
@@ -715,7 +778,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
	}

	fsn_event = &event->fse;
	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
	ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
				 fanotify_is_hashed_event(mask) ?
				 fanotify_insert_event : NULL);
	if (ret) {
		/* Permission events shouldn't be merged */
		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -736,11 +801,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,

static void fanotify_free_group_priv(struct fsnotify_group *group)
{
	struct user_struct *user;

	user = group->fanotify_data.user;
	atomic_dec(&user->fanotify_listeners);
	free_uid(user);
	kfree(group->fanotify_data.merge_hash);
	if (group->fanotify_data.ucounts)
		dec_ucount(group->fanotify_data.ucounts,
			   UCOUNT_FANOTIFY_GROUPS);
}

static void fanotify_free_path_event(struct fanotify_event *event)
@@ -796,6 +860,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
	}
}

static void fanotify_freeing_mark(struct fsnotify_mark *mark,
				  struct fsnotify_group *group)
{
	/* Unlimited-marks groups did not charge the per-user mark ucount. */
	if (FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		return;

	dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS);
}

/* Return a mark to the fanotify mark slab cache. */
static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
{
	kmem_cache_free(fanotify_mark_cache, fsn_mark);
}
@@ -805,5 +876,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = {
	.handle_event = fanotify_handle_event,
	.free_group_priv = fanotify_free_group_priv,
	.free_event = fanotify_free_event,
	.freeing_mark = fanotify_freeing_mark,
	.free_mark = fanotify_free_mark,
};
+43 −3
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/exportfs.h>
#include <linux/hashtable.h>

extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_fid_event_cachep;
@@ -115,6 +116,11 @@ static inline void fanotify_info_init(struct fanotify_info *info)
	info->name_len = 0;
}

static inline unsigned int fanotify_info_len(struct fanotify_info *info)
{
	/* Sum of the dir fh record, the file fh record and the name length. */
	unsigned int len = info->dir_fh_totlen;

	len += info->file_fh_totlen;
	len += info->name_len;
	return len;
}

static inline void fanotify_info_copy_name(struct fanotify_info *info,
					   const struct qstr *name)
{
@@ -135,19 +141,31 @@ enum fanotify_event_type {
	FANOTIFY_EVENT_TYPE_PATH,
	FANOTIFY_EVENT_TYPE_PATH_PERM,
	FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
	__FANOTIFY_EVENT_TYPE_NUM
};

#define FANOTIFY_EVENT_TYPE_BITS \
	(ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1)
#define FANOTIFY_EVENT_HASH_BITS \
	(32 - FANOTIFY_EVENT_TYPE_BITS)

struct fanotify_event {
	struct fsnotify_event fse;
	struct hlist_node merge_list;	/* List for hashed merge */
	u32 mask;
	enum fanotify_event_type type;
	struct {
		unsigned int type : FANOTIFY_EVENT_TYPE_BITS;
		unsigned int hash : FANOTIFY_EVENT_HASH_BITS;
	};
	struct pid *pid;
};

static inline void fanotify_init_event(struct fanotify_event *event,
				       unsigned long id, u32 mask)
				       unsigned int hash, u32 mask)
{
	fsnotify_init_event(&event->fse, id);
	fsnotify_init_event(&event->fse);
	INIT_HLIST_NODE(&event->merge_list);
	event->hash = hash;
	event->mask = mask;
	event->pid = NULL;
}
@@ -284,3 +302,25 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
	else
		return NULL;
}

/*
 * Use 128 size hash table to speed up events merge.
 */
#define FANOTIFY_HTABLE_BITS	(7)
#define FANOTIFY_HTABLE_SIZE	(1 << FANOTIFY_HTABLE_BITS)
/* Mask to map an event merge-key hash to a bucket index. */
#define FANOTIFY_HTABLE_MASK	(FANOTIFY_HTABLE_SIZE - 1)

/*
 * Permission events and overflow event do not get merged - don't hash them.
 */
static inline bool fanotify_is_hashed_event(u32 mask)
{
	if (fanotify_is_perm_event(mask))
		return false;

	return !(mask & FS_Q_OVERFLOW);
}

static inline unsigned int fanotify_event_hash_bucket(
						struct fsnotify_group *group,
						struct fanotify_event *event)
{
	/* @group is currently unused; the bucket depends on the hash alone. */
	unsigned int bucket = event->hash & FANOTIFY_HTABLE_MASK;

	return bucket;
}
+186 −33
Original line number Diff line number Diff line
@@ -27,8 +27,61 @@
#include "fanotify.h"

#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128

/*
 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
 * limit of marks per user, similar to inotify.  Effectively, the legacy limit
 * of fanotify marks per user is <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Tunables exposed under /proc/sys/fs/fanotify/ (see CONFIG_SYSCTL). */
struct ctl_table fanotify_table[] = {
	{
		/* Max number of fanotify groups per user (ucount-backed). */
		.procname	= "max_user_groups",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/* Max number of fanotify marks per user (ucount-backed). */
		.procname	= "max_user_marks",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/* Default event queue depth for new groups. */
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
	{ }	/* sentinel */
};
#endif /* CONFIG_SYSCTL */

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
@@ -89,6 +142,23 @@ static int fanotify_event_info_len(unsigned int fid_mode,
	return info_len;
}

/*
 * Remove an hashed event from merge hash table.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	unsigned int bucket = fanotify_event_hash_bucket(group, event);

	/* The merge hash is protected by the notification lock. */
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, bucket);

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}

/*
 * Get an fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
@@ -100,26 +170,34 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
{
	size_t event_size = FAN_EVENT_METADATA_LEN;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	if (fsnotify_notify_queue_is_empty(group))
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;

	if (fid_mode) {
		event_size += fanotify_event_info_len(fid_mode,
			FANOTIFY_E(fsnotify_peek_first_event(group)));
	}
	event = FANOTIFY_E(fsn_event);
	if (fid_mode)
		event_size += fanotify_event_info_len(fid_mode, event);

	if (event_size > count) {
		event = ERR_PTR(-EINVAL);
		goto out;
	}
	event = FANOTIFY_E(fsnotify_remove_first_event(group));

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
@@ -341,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (!capable(CAP_SYS_ADMIN) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	if (path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
@@ -573,6 +659,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
@@ -601,13 +688,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while (!fsnotify_notify_queue_is_empty(group)) {
		struct fanotify_event *event;
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		event = FANOTIFY_E(fsnotify_remove_first_event(group));
		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, &event->fse);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW);
@@ -822,24 +908,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   unsigned int type,
						   __kernel_fsid_t *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fsnotify_mark *mark;
	int ret;

	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
	/*
	 * Enforce per user marks limits per user in all containing user ns.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
	 * in the limited groups account.
	 */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!mark)
		return ERR_PTR(-ENOMEM);
	if (!mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	fsnotify_init_mark(mark, group);
	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
	if (ret) {
		fsnotify_put_mark(mark);
		return ERR_PTR(ret);
		goto out_dec_ucounts;
	}

	return mark;

out_dec_ucounts:
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}


@@ -919,20 +1019,41 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
	return &oevent->fse;
}

static struct hlist_head *fanotify_alloc_merge_hash(void)
{
	/* Accounted allocation (GFP_KERNEL_ACCOUNT) of all hash buckets. */
	struct hlist_head *hash;

	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
		       GFP_KERNEL_ACCOUNT);
	if (hash)
		__hash_init(hash, FANOTIFY_HTABLE_SIZE);

	/* NULL on allocation failure; caller reports -ENOMEM. */
	return hash;
}

/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	struct user_struct *user;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN))
	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
@@ -963,12 +1084,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	user = get_current_user();
	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
		free_uid(user);
		return -EMFILE;
	}

	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
@@ -978,15 +1093,27 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		free_uid(user);
		return PTR_ERR(group);
	}

	group->fanotify_data.user = user;
	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags;
	atomic_inc(&user->fanotify_listeners);
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
@@ -1019,16 +1146,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->fanotify_data.max_marks = UINT_MAX;
	} else {
		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
	}

	if (flags & FAN_ENABLE_AUDIT) {
@@ -1126,7 +1250,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (mask & ((__u64)0xffffffff << 32))
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
@@ -1180,6 +1304,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to watch a mount point nor
	 * a filesystem.
	 */
	ret = -EPERM;
	if (!capable(CAP_SYS_ADMIN) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permissions events.
@@ -1312,6 +1445,21 @@ SYSCALL32_DEFINE6(fanotify_mark,
 */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
				     FANOTIFY_DEFAULT_MAX_USER_MARKS);

	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);

@@ -1326,6 +1474,11 @@ static int __init fanotify_user_setup(void)
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
					FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;

	return 0;
}
device_initcall(fanotify_user_setup);
Loading