Unverified Commit 71bc356f authored by Christian Brauner's avatar Christian Brauner
Browse files

commoncap: handle idmapped mounts

When interacting with user namespace and non-user namespace aware
filesystem capabilities the vfs will perform various security checks to
determine whether or not the filesystem capabilities can be used by the
caller, whether they need to be removed and so on. The main
infrastructure for this resides in the capability codepaths but they are
called through the LSM security infrastructure even though they are not
technically an LSM or optional. This extends the existing security hooks
security_inode_removexattr(), security_inode_killpriv(),
security_inode_getsecurity() to pass down the mount's user namespace and
makes them aware of idmapped mounts.

In order to actually get filesystem capabilities from disk the
capability infrastructure exposes the get_vfs_caps_from_disk() helper.
For user namespace aware filesystem capabilities a root uid is stored
alongside the capabilities.

In order to determine whether the caller can make use of the filesystem
capability or whether it needs to be ignored it is translated according
to the superblock's user namespace. If it can be translated to uid 0
according to that id mapping the caller can use the filesystem
capabilities stored on disk. If we are accessing the inode that holds
the filesystem capabilities through an idmapped mount we map the root
uid according to the mount's user namespace. Afterwards the checks are
identical to non-idmapped mounts: reading filesystem caps from disk
enforces that the root uid associated with the filesystem capability
must have a mapping in the superblock's user namespace and that the
caller is either in the same user namespace or is a descendant of the
superblock's user namespace. For filesystems that are mountable inside
user namespace the caller can just mount the filesystem and won't
usually need to idmap it. If they do want to idmap it they can create an
idmapped mount and mark it with a user namespace they created and which
is thus a descendant of s_user_ns. For filesystems that are not
mountable inside user namespaces the descendant rule is trivially true
because the s_user_ns will be the initial user namespace.

If the initial user namespace is passed nothing changes so non-idmapped
mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-11-christian.brauner@ubuntu.com


Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Acked-by: default avatarJames Morris <jamorris@linux.microsoft.com>
Signed-off-by: default avatarChristian Brauner <christian.brauner@ubuntu.com>
parent c7c7a1a1
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -143,7 +143,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
	if (ia_valid & ATTR_KILL_PRIV) {
		int error;

		error = security_inode_killpriv(dentry);
		error = security_inode_killpriv(mnt_userns, dentry);
		if (error)
			return error;
	}
+11 −7
Original line number Diff line number Diff line
@@ -262,7 +262,8 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry,
	if (error)
		return error;

	error = security_inode_setxattr(dentry, name, value, size, flags);
	error = security_inode_setxattr(mnt_userns, dentry, name, value, size,
					flags);
	if (error)
		goto out;

@@ -313,18 +314,20 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
EXPORT_SYMBOL_GPL(vfs_setxattr);

static ssize_t
xattr_getsecurity(struct inode *inode, const char *name, void *value,
			size_t size)
xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode,
		  const char *name, void *value, size_t size)
{
	void *buffer = NULL;
	ssize_t len;

	if (!value || !size) {
		len = security_inode_getsecurity(inode, name, &buffer, false);
		len = security_inode_getsecurity(mnt_userns, inode, name,
						 &buffer, false);
		goto out_noalloc;
	}

	len = security_inode_getsecurity(inode, name, &buffer, true);
	len = security_inode_getsecurity(mnt_userns, inode, name, &buffer,
					 true);
	if (len < 0)
		return len;
	if (size < len) {
@@ -414,7 +417,8 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
	if (!strncmp(name, XATTR_SECURITY_PREFIX,
				XATTR_SECURITY_PREFIX_LEN)) {
		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
		int ret = xattr_getsecurity(inode, suffix, value, size);
		int ret = xattr_getsecurity(mnt_userns, inode, suffix, value,
					    size);
		/*
		 * Only overwrite the return value if a security module
		 * is actually active.
@@ -486,7 +490,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns,
	if (error)
		return error;

	error = security_inode_removexattr(dentry, name);
	error = security_inode_removexattr(mnt_userns, dentry, name);
	if (error)
		goto out;

+3 −1
Original line number Diff line number Diff line
@@ -271,7 +271,9 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
}

/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
			   const struct dentry *dentry,
			   struct cpu_vfs_cap_data *cpu_caps);

int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
		      const void **ivalue, size_t size);
+9 −6
Original line number Diff line number Diff line
@@ -133,17 +133,20 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode,
LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr)
LSM_HOOK(int, 0, inode_getattr, const struct path *path)
LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name,
	 const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns,
	 struct dentry *dentry, const char *name, const void *value,
	 size_t size, int flags)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
	 const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns,
	 struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode,
	 const char *name, void **buffer, bool alloc)
LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns,
	 struct dentry *dentry)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns,
	 struct inode *inode, const char *name, void **buffer, bool alloc)
LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
	 const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
+1 −0
Original line number Diff line number Diff line
@@ -444,6 +444,7 @@
 * @inode_killpriv:
 *	The setuid bit is being removed.  Remove similar security labels.
 *	Called with the dentry->d_inode->i_mutex held.
 *	@mnt_userns: user namespace of the mount
 *	@dentry is the dentry being changed.
 *	Return 0 on success.  If error is returned, then the operation
 *	causing setuid bit removal is failed.
Loading