Commit 03fa86e9 authored by Al Viro's avatar Al Viro
Browse files

namei: stash the sampled ->d_seq into nameidata



New field: nd->next_seq.  Set to 0 outside of RCU mode, holds the sampled
value for the next dentry to be considered.  Used instead of an arseload
of local variables, arguments, etc.

step_into() has lost seq argument; nd->next_seq is used, so dentry passed
to it must be the one ->next_seq is about.

There are two requirements for RCU pathwalk:
	1) it should not give a hard failure (other than -ECHILD) unless
non-RCU pathwalk might fail that way given suitable timings.
	2) it should not succeed unless non-RCU pathwalk might succeed
with the same end location given suitable timings.

The use of seq numbers is the way we achieve that.  Invariant we want
to maintain is:
	if RCU pathwalk can reach the state with given nd->path, nd->inode
and nd->seq after having traversed some part of pathname, it must be possible
for non-RCU pathwalk to reach the same nd->path and nd->inode after having
traversed the same part of pathname, and observe the nd->path.dentry->d_seq
equal to what RCU pathwalk has in nd->seq

	For transition from parent to child, we sample child's ->d_seq
and verify that parent's ->d_seq remains unchanged.  Anything that
disrupts parent-child relationship would've bumped ->d_seq on both.
	For transitions from child to parent we sample parent's ->d_seq
and verify that child's ->d_seq has not changed.  Same reasoning as
for the previous case applies.
	For transition from mountpoint to root of mounted we sample
the ->d_seq of root and verify that nobody has touched mount_lock since
the beginning of pathwalk.  That guarantees that the mount we'd found had
been there all along, with the same mountpoint and root of the mounted.
It would be possible for a non-RCU pathwalk to reach the previous state,
find the same mount and observe its root at the moment we'd sampled
->d_seq of that root.
	For transitions from root of mounted to mountpoint we sample
->d_seq of mountpoint and verify that mount_lock had not been touched
since the beginning of pathwalk.  The same reasoning as in the
previous case applies.

Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 6e180327
Loading
Loading
Loading
Loading
+48 −50
Original line number Diff line number Diff line
@@ -567,7 +567,7 @@ struct nameidata {
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags, state;
	unsigned	seq, m_seq, r_seq;
	unsigned	seq, next_seq, m_seq, r_seq;
	int		last_type;
	unsigned	depth;
	int		total_link_count;
@@ -668,6 +668,7 @@ static void drop_links(struct nameidata *nd)
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}

@@ -792,7 +793,6 @@ static bool try_to_unlazy(struct nameidata *nd)
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
@@ -801,7 +801,7 @@ static bool try_to_unlazy(struct nameidata *nd)
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));
@@ -826,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, seq))
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(follow_down);
 * we meet a managed dentry that would need blocking.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
			       struct inode **inode, unsigned *seqp)
			       struct inode **inode)
{
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;
@@ -1504,7 +1504,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->state |= ND_JUMPED;
				*seqp = read_seqcount_begin(&dentry->d_seq);
				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
				*inode = dentry->d_inode;
				/*
				 * We don't need to re-check ->d_seq after this
@@ -1513,6 +1513,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
				 * becoming unpinned.
				 */
				flags = dentry->d_flags;
				// makes sure that non-RCU pathwalk could reach
				// this state.
				if (read_seqretry(&mount_lock, nd->m_seq))
					return false;
				continue;
@@ -1525,8 +1527,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
}

static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path, struct inode **inode,
			  unsigned int *seqp)
			  struct path *path, struct inode **inode)
{
	bool jumped;
	int ret;
@@ -1534,16 +1535,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = *seqp;
		unsigned int seq = nd->next_seq;
		if (unlikely(!*inode))
			return -ENOENT;
		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
		if (likely(__follow_mount_rcu(nd, path, inode)))
			return 0;
		if (!try_to_unlazy_next(nd, dentry, seq))
			return -ECHILD;
		// *path might've been clobbered by __follow_mount_rcu()
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (!try_to_unlazy_next(nd, dentry))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
@@ -1558,7 +1560,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			mntput(path->mnt);
	} else {
		*inode = d_backing_inode(path->dentry);
		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
	}
	return ret;
}
@@ -1618,8 +1619,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
}

static struct dentry *lookup_fast(struct nameidata *nd,
				  struct inode **inode,
			          unsigned *seqp)
				  struct inode **inode)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;
@@ -1630,8 +1630,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
@@ -1643,7 +1642,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
		 * the dentry name information from lookup.
		 */
		*inode = d_backing_inode(dentry);
		if (read_seqcount_retry(&dentry->d_seq, seq))
		if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);

		/*
@@ -1656,11 +1655,10 @@ static struct dentry *lookup_fast(struct nameidata *nd,
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		*seqp = seq;
		status = d_revalidate(dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		if (!try_to_unlazy_next(nd, dentry, seq))
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
@@ -1741,7 +1739,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns,
	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}

static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;
@@ -1756,7 +1754,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, seq);
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;
@@ -1770,11 +1768,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

static const char *pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, unsigned seq, int flags)
		     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error = reserve_stack(nd, link, seq);
	int error = reserve_stack(nd, link);

	if (unlikely(error)) {
		if (!(nd->flags & LOOKUP_RCU))
@@ -1784,7 +1782,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = seq;
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
@@ -1846,12 +1844,14 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 */
static const char *step_into(struct nameidata *nd, int flags,
		     struct dentry *dentry, struct inode *inode, unsigned seq)
		     struct dentry *dentry, struct inode *inode)
{
	struct path path;
	int err = handle_mounts(nd, dentry, &path, &inode, &seq);
	int err = handle_mounts(nd, dentry, &path, &inode);

	if (err < 0)
		return ERR_PTR(err);
@@ -1866,23 +1866,22 @@ static const char *step_into(struct nameidata *nd, int flags,
		}
		nd->path = path;
		nd->inode = inode;
		nd->seq = seq;
		nd->seq = nd->next_seq;
		return NULL;
	}
	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink above matches inode */
		if (read_seqcount_retry(&path.dentry->d_seq, seq))
		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
	}
	return pick_link(nd, &path, inode, seq, flags);
	return pick_link(nd, &path, inode, flags);
}

static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
					struct inode **inodep)
{
	struct dentry *parent, *old;

@@ -1899,6 +1898,7 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
@@ -1906,7 +1906,8 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
@@ -1917,14 +1918,13 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	*seqp = nd->seq;
	nd->next_seq = nd->seq;
	*inodep = nd->path.dentry->d_inode;
	return nd->path.dentry;
}

static struct dentry *follow_dotdot(struct nameidata *nd,
				 struct inode **inodep,
				 unsigned *seqp)
				 struct inode **inodep)
{
	struct dentry *parent;

@@ -1948,14 +1948,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	*seqp = 0;
	*inodep = parent->d_inode;
	return parent;

in_root:
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	*seqp = 0;
	*inodep = nd->path.dentry->d_inode;
	return dget(nd->path.dentry);
}
@@ -1966,7 +1964,6 @@ static const char *handle_dots(struct nameidata *nd, int type)
		const char *error = NULL;
		struct dentry *parent;
		struct inode *inode;
		unsigned seq;

		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
@@ -1974,12 +1971,12 @@ static const char *handle_dots(struct nameidata *nd, int type)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd, &inode, &seq);
			parent = follow_dotdot_rcu(nd, &inode);
		else
			parent = follow_dotdot(nd, &inode, &seq);
			parent = follow_dotdot(nd, &inode);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		error = step_into(nd, WALK_NOFOLLOW, parent, inode, seq);
		error = step_into(nd, WALK_NOFOLLOW, parent, inode);
		if (unlikely(error))
			return error;

@@ -2004,7 +2001,6 @@ static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	struct inode *inode;
	unsigned seq;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
@@ -2015,7 +2011,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	dentry = lookup_fast(nd, &inode, &seq);
	dentry = lookup_fast(nd, &inode);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
@@ -2025,7 +2021,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
	}
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, dentry, inode, seq);
	return step_into(nd, flags, dentry, inode);
}

/*
@@ -2380,6 +2376,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;
@@ -2481,8 +2479,9 @@ static int handle_lookup_down(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU))
		dget(nd->path.dentry);
	nd->next_seq = nd->seq;
	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
			nd->path.dentry, nd->inode, nd->seq));
			nd->path.dentry, nd->inode));
}

/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
@@ -3401,7 +3400,6 @@ static const char *open_last_lookups(struct nameidata *nd,
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	unsigned seq;
	struct inode *inode;
	struct dentry *dentry;
	const char *res;
@@ -3418,7 +3416,7 @@ static const char *open_last_lookups(struct nameidata *nd,
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		dentry = lookup_fast(nd, &inode, &seq);
		dentry = lookup_fast(nd, &inode);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
@@ -3472,7 +3470,7 @@ static const char *open_last_lookups(struct nameidata *nd,
finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
	res = step_into(nd, WALK_TRAILING, dentry, inode);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;