Commit 6a953678 authored by Tejun Heo's avatar Tejun Heo Committed by Li Lingfeng
Browse files

kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id()

stable inclusion
from stable-v6.6.27
commit 4b1f991bad566de14ae79bca952d201189cada52
bugzilla: https://gitee.com/openeuler/kernel/issues/I9MPZ8

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=4b1f991bad566de14ae79bca952d201189cada52



--------------------------------

[ Upstream commit 4207b556e62f0a8915afc5da4c5d5ad915a253a5 ]

The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.

Consider the following BPF program:

  SEC("fentry/__set_cpus_allowed_ptr_locked")
  int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
	       struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
  {
	  struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);

	  if (cgrp) {
		  bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
		  bpf_cgroup_release(cgrp);
	  }
	  return 0;
  }

__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:

  =====================================================
  WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
  6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
  -----------------------------------------------------
  repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
  ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70

		and this task is already holding:
  ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
  which would create a new lock dependency:
   (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
  ...
   Possible interrupt unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(kernfs_idr_lock);
				 local_irq_disable();
				 lock(&rq->__lock);
				 lock(kernfs_idr_lock);
    <Interrupt>
      lock(&rq->__lock);

		 *** DEADLOCK ***
  ...
  Call Trace:
   dump_stack_lvl+0x55/0x70
   dump_stack+0x10/0x20
   __lock_acquire+0x781/0x2a40
   lock_acquire+0xbf/0x1f0
   _raw_spin_lock+0x2f/0x40
   kernfs_find_and_get_node_by_id+0x1e/0x70
   cgroup_get_from_id+0x21/0x240
   bpf_cgroup_from_id+0xe/0x20
   bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
   bpf_trampoline_6442545632+0x4f/0x1000
   __set_cpus_allowed_ptr_locked+0x5/0x5a0
   sched_setaffinity+0x1b3/0x290
   __x64_sys_sched_setaffinity+0x4f/0x60
   do_syscall_64+0x40/0xe0
   entry_SYSCALL_64_after_hwframe+0x46/0x4e

Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.

This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org


Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: default avatarSasha Levin <sashal@kernel.org>

Conflicts:
  fs/kernfs/kernfs-internal.h
[Commit 50dfec9d("kernfs: kabi: Reserve KABI slots for kernfs_*
struct") from OLK-6.6 add KABI_RESERVE in struct kernfs_root]
Signed-off-by: default avatarLi Lingfeng <lilingfeng3@huawei.com>
parent 8c0dfc8b
Loading
Loading
Loading
Loading
+20 −11
Original line number Diff line number Diff line
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_get);

static void kernfs_free_rcu(struct rcu_head *rcu)
{
	struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);

	kfree_const(kn->name);

	if (kn->iattr) {
		simple_xattrs_free(&kn->iattr->xattrs, NULL);
		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
	}

	kmem_cache_free(kernfs_node_cache, kn);
}

/**
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
	if (kernfs_type(kn) == KERNFS_LINK)
		kernfs_put(kn->symlink.target_kn);

	kfree_const(kn->name);

	if (kn->iattr) {
		simple_xattrs_free(&kn->iattr->xattrs, NULL);
		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
	}
	spin_lock(&kernfs_idr_lock);
	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
	spin_unlock(&kernfs_idr_lock);
	kmem_cache_free(kernfs_node_cache, kn);

	call_rcu(&kn->rcu, kernfs_free_rcu);

	kn = parent;
	if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
	} else {
		/* just released the root kn, free @root too */
		idr_destroy(&root->ino_idr);
		kfree(root);
		kfree_rcu(root, rcu);
	}
}
EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
	ino_t ino = kernfs_id_ino(id);
	u32 gen = kernfs_id_gen(id);

	spin_lock(&kernfs_idr_lock);
	rcu_read_lock();

	kn = idr_find(&root->ino_idr, (u32)ino);
	if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
	if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
		goto err_unlock;

	spin_unlock(&kernfs_idr_lock);
	rcu_read_unlock();
	return kn;
err_unlock:
	spin_unlock(&kernfs_idr_lock);
	rcu_read_unlock();
	return NULL;
}

+2 −0
Original line number Diff line number Diff line
@@ -50,6 +50,8 @@ struct kernfs_root {
	struct rw_semaphore	kernfs_iattr_rwsem;
	struct rw_semaphore	kernfs_supers_rwsem;

	struct rcu_head 	rcu;

	KABI_RESERVE(1)
	KABI_RESERVE(2)
	KABI_RESERVE(3)
+2 −0
Original line number Diff line number Diff line
@@ -224,6 +224,8 @@ struct kernfs_node {
	unsigned short		flags;
	umode_t			mode;
	struct kernfs_iattrs	*iattr;

	struct rcu_head		rcu;
};

/*