Commit 645b34a7 authored by David S. Miller
Browse files

Merge branch 'netns-sysctl-isolation'

Jonathon Reinhart says:

====================
Ensuring net sysctl isolation

This patchset is the result of an audit of /proc/sys/net to prove that
it is safe to be mounted read-write in a container when a net namespace
is in use. See [1].

The first commit adds code to detect sysctls which are not netns-safe,
and can "leak" changes to other net namespaces.

My manual audit found, and the above feature confirmed, that there are
two nf_conntrack sysctls which are in fact not netns-safe.

I considered sending the latter to netfilter-devel, but I think it's
better to have both together on net-next: Adding only the former causes
undesirable warnings in the kernel log.

[1]: https://github.com/opencontainers/runc/issues/2826


====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents a115d24a 2671fa4d
Loading
Loading
Loading
Loading
+2 −8
Original line number Diff line number Diff line
@@ -1060,16 +1060,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
	nf_conntrack_standalone_init_dccp_sysctl(net, table);
	nf_conntrack_standalone_init_gre_sysctl(net, table);

	/* Don't allow unprivileged users to alter certain sysctls */
	if (net->user_ns != &init_user_ns) {
	/* Don't allow non-init_net ns to alter global sysctls */
	if (!net_eq(&init_net, net)) {
		table[NF_SYSCTL_CT_MAX].mode = 0444;
		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
		table[NF_SYSCTL_CT_HELPER].mode = 0444;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
		table[NF_SYSCTL_CT_EVENTS].mode = 0444;
#endif
		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
	} else if (!net_eq(&init_net, net)) {
		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
	}

+48 −0
Original line number Diff line number Diff line
@@ -115,9 +115,57 @@ __init int net_sysctl_init(void)
	goto out;
}

/* Audit a sysctl table that is about to be registered for a non-init netns.
 *
 * An entry is accepted as-is when either:
 * 1) no write bit is set in its mode, or
 * 2) its data pointer lies outside the global kernel/module data segment
 *    (i.e. presumably in a heap-allocated per-net object).
 *
 * Any other entry triggers a WARN naming the sysctl and has its write
 * bits cleared in place.
 *
 * @net:   namespace the table is registered for (used for debug output only)
 * @path:  sysctl directory path, used in the diagnostics
 * @table: table to audit; iteration stops at the first entry whose
 *         procname is NULL.  Entries may have their mode modified.
 */
static void ensure_safe_net_sysctl(struct net *net, const char *path,
				   struct ctl_table *table)
{
	struct ctl_table *entry = table;

	pr_debug("Registering net sysctl (net %p): %s\n", net, path);
	for (; entry->procname; entry++) {
		const char *region = NULL;
		unsigned long data_addr;

		pr_debug("  procname=%s mode=%o proc_handler=%ps data=%p\n",
			 entry->procname, entry->mode, entry->proc_handler,
			 entry->data);

		/* Nothing can be written through a read-only entry. */
		if (!(entry->mode & 0222)) {
			pr_debug("    Not writable by anyone\n");
			continue;
		}

		/* Classify the memory region that data refers to. */
		data_addr = (unsigned long)entry->data;
		if (is_module_address(data_addr))
			region = "module";
		else if (core_kernel_data(data_addr))
			region = "kernel";

		/* Neither module nor core kernel data: nothing to flag. */
		if (!region)
			continue;

		/* Writable and backed by kernel/module global data:
		 * most likely a leak across net namespaces.
		 */
		WARN(1, "sysctl %s/%s: data points to %s global data: %ps\n",
		     path, entry->procname, region, entry->data);

		/* Defuse the entry by stripping every write permission bit. */
		entry->mode &= ~0222;
	}
}

/* Register a sysctl table under @path in the sysctl set of @net.
 *
 * Tables registered for any namespace other than init_net are first passed
 * through ensure_safe_net_sysctl(), which may clear write permission bits
 * on individual entries of @table.
 *
 * Returns whatever __register_sysctl_table() returns.
 */
struct ctl_table_header *register_net_sysctl(struct net *net,
	const char *path, struct ctl_table *table)
{
	/* The initial namespace is registered without the safety audit. */
	if (net_eq(net, &init_net))
		return __register_sysctl_table(&net->sysctls, path, table);

	ensure_safe_net_sysctl(net, path, table);
	return __register_sysctl_table(&net->sysctls, path, table);
}
EXPORT_SYMBOL_GPL(register_net_sysctl);