Commit d9395512 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull vfs namei updates from Al Viro:
 "RCU pathwalk cleanups.

  Storing sampled ->d_seq of the next dentry in nameidata simplifies
  life considerably, especially if we delay fetching ->d_inode until
  step_into()"

* tag 'pull-work.namei' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  step_into(): move fetching ->d_inode past handle_mounts()
  lookup_fast(): don't bother with inode
  follow_dotdot{,_rcu}(): don't bother with inode
  step_into(): lose inode argument
  namei: stash the sampled ->d_seq into nameidata
  namei: move clearing LOOKUP_RCU towards rcu_read_unlock()
  switch try_to_unlazy_next() to __legitimize_mnt()
  follow_dotdot{,_rcu}(): change calling conventions
  namei: get rid of pointless unlikely(read_seqcount_retry(...))
  __follow_mount_rcu(): verify that mount_lock remains unchanged
parents f0065400 3bd8bc89
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -100,7 +100,6 @@ static inline int is_mounted(struct vfsmount *mnt)
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);

static inline bool __path_is_mountpoint(const struct path *path)
{
+85 −106
Original line number Diff line number Diff line
@@ -567,7 +567,7 @@ struct nameidata {
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags, state;
	unsigned	seq, m_seq, r_seq;
	unsigned	seq, next_seq, m_seq, r_seq;
	int		last_type;
	unsigned	depth;
	int		total_link_count;
@@ -665,6 +665,13 @@ static void drop_links(struct nameidata *nd)
	}
}

static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}

static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
@@ -678,8 +685,7 @@ static void terminate_walk(struct nameidata *nd)
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		nd->flags &= ~LOOKUP_RCU;
		rcu_read_unlock();
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
@@ -765,14 +771,13 @@ static bool try_to_unlazy(struct nameidata *nd)

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	rcu_read_unlock();
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

@@ -780,7 +785,7 @@ static bool try_to_unlazy(struct nameidata *nd)
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
	leave_rcu(nd);
	return false;
}

@@ -788,7 +793,6 @@ static bool try_to_unlazy(struct nameidata *nd)
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
@@ -797,15 +801,19 @@ static bool try_to_unlazy(struct nameidata *nd)
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			goto out2;
		goto out1;
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

@@ -818,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
@@ -826,7 +834,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	rcu_read_unlock();
	leave_rcu(nd);
	return true;

out2:
@@ -834,10 +842,10 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
out1:
	nd->path.dentry = NULL;
out:
	rcu_read_unlock();
	leave_rcu(nd);
	return false;
out_dput:
	rcu_read_unlock();
	leave_rcu(nd);
	dput(dentry);
	return false;
}
@@ -962,7 +970,7 @@ static int nd_jump_root(struct nameidata *nd)
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		path_put(&nd->path);
@@ -1466,8 +1474,7 @@ EXPORT_SYMBOL(follow_down);
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
			       struct inode **inode, unsigned *seqp)
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;
@@ -1496,15 +1503,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->state |= ND_JUMPED;
				*seqp = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
				 * ->d_inode read - there will be an RCU delay
				 * between mount hash removal and ->mnt_root
				 * becoming unpinned.
				 */
				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
				flags = dentry->d_flags;
				// makes sure that non-RCU pathwalk could reach
				// this state.
				if (read_seqretry(&mount_lock, nd->m_seq))
					return false;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
@@ -1515,8 +1519,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
}

static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
			  struct path *path)
{
	bool jumped;
	int ret;
@@ -1524,16 +1527,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
		unsigned int seq = nd->next_seq;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		if (!try_to_unlazy_next(nd, dentry, seq))
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (!try_to_unlazy_next(nd, dentry))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
@@ -1546,9 +1548,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	} else {
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}
@@ -1607,9 +1606,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
	return dentry;
}

static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
static struct dentry *lookup_fast(struct nameidata *nd)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;
@@ -1620,37 +1617,24 @@ static struct dentry *lookup_fast(struct nameidata *nd,
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
			return NULL;
		}

		/*
		 * This sequence count validates that the inode matches
		 * the dentry name information from lookup.
		 */
		*inode = d_backing_inode(dentry);
		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
			return ERR_PTR(-ECHILD);

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 *
		 * The memory barrier in read_seqcount_begin of child is
		 *  enough, we can use __read_seqcount_retry here.
		 */
		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		*seqp = seq;
		status = d_revalidate(dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		if (!try_to_unlazy_next(nd, dentry, seq))
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
@@ -1731,7 +1715,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns,
	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}

static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
@@ -1746,7 +1730,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, seq);
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;
@@ -1760,11 +1744,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

static const char *pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, unsigned seq, int flags)
		     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error = reserve_stack(nd, link, seq);
	int error = reserve_stack(nd, link);

	if (unlikely(error)) {
		if (!(nd->flags & LOOKUP_RCU))
@@ -1774,7 +1758,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = seq;
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
@@ -1836,43 +1820,50 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 */
static const char *step_into(struct nameidata *nd, int flags,
		     struct dentry *dentry, struct inode *inode, unsigned seq)
		     struct dentry *dentry)
{
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);
	struct inode *inode;
	int err = handle_mounts(nd, dentry, &path);

	if (err < 0)
		return ERR_PTR(err);
	inode = path.dentry->d_inode;
	if (likely(!d_is_symlink(path.dentry)) ||
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
	   (flags & WALK_NOFOLLOW)) {
		/* not a symlink or should not follow */
		if (!(nd->flags & LOOKUP_RCU)) {
		if (nd->flags & LOOKUP_RCU) {
			if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
				return ERR_PTR(-ECHILD);
			if (unlikely(!inode))
				return ERR_PTR(-ENOENT);
		} else {
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
		nd->inode = inode;
		nd->seq = seq;
		nd->seq = nd->next_seq;
		return NULL;
	}
	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink above matches inode */
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
	}
	return pick_link(nd, &path, inode, seq, flags);
	return pick_link(nd, &path, inode, flags);
}

static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

@@ -1889,30 +1880,30 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}

static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

@@ -1936,15 +1927,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	dget(nd->path.dentry);
	return NULL;
	return dget(nd->path.dentry);
}

static const char *handle_dots(struct nameidata *nd, int type)
@@ -1952,8 +1940,6 @@ static const char *handle_dots(struct nameidata *nd, int type)
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;

		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
@@ -1961,17 +1947,12 @@ static const char *handle_dots(struct nameidata *nd, int type)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd, &inode, &seq);
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd, &inode, &seq);
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		if (unlikely(!parent))
			error = step_into(nd, WALK_NOFOLLOW,
					 nd->path.dentry, nd->inode, nd->seq);
		else
			error = step_into(nd, WALK_NOFOLLOW,
					 parent, inode, seq);
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

@@ -1983,9 +1964,9 @@ static const char *handle_dots(struct nameidata *nd, int type)
			 * some fallback).
			 */
			smp_rmb();
			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
@@ -1995,8 +1976,6 @@ static const char *handle_dots(struct nameidata *nd, int type)
static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	struct inode *inode;
	unsigned seq;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
@@ -2007,7 +1986,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	dentry = lookup_fast(nd, &inode, &seq);
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
@@ -2017,7 +1996,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
	}
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, dentry, inode, seq);
	return step_into(nd, flags, dentry);
}

/*
@@ -2372,6 +2351,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;
@@ -2473,8 +2454,8 @@ static int handle_lookup_down(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU))
		dget(nd->path.dentry);
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
	nd->next_seq = nd->seq;
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}

/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -3393,8 +3374,6 @@ static const char *open_last_lookups(struct nameidata *nd,
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	unsigned seq;
	struct inode *inode;
	struct dentry *dentry;
	const char *res;

@@ -3410,7 +3389,7 @@ static const char *open_last_lookups(struct nameidata *nd,
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		dentry = lookup_fast(nd, &inode, &seq);
		dentry = lookup_fast(nd);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
@@ -3464,7 +3443,7 @@ static const char *open_last_lookups(struct nameidata *nd,
finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
+1 −1
Original line number Diff line number Diff line
@@ -648,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))