Commit 84ad0af0 authored by Peilin Ye's avatar Peilin Ye Committed by Paolo Abeni
Browse files

net/sched: qdisc_destroy() old ingress and clsact Qdiscs before grafting



mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress.  ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap().  Similar
for clsact Qdiscs and miniq_egress.

Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:

 BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
 Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
 Call Trace:
  <TASK>
  __dump_stack lib/dump_stack.c:88 [inline]
  dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
  print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
  print_report mm/kasan/report.c:430 [inline]
  kasan_report+0x11c/0x130 mm/kasan/report.c:536
  mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
  tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
  tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
  tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
  tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
  tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...

@old and @new should not affect each other.  In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.

Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions).  Instead, make sure @new's first call
always happen after @old's last call (in {ingress,clsact}_destroy()) has
finished:

In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.

Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests.  Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.

Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".

[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd11 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:

  Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
  then adds a flower filter X to A.

  Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
  b2) to replace A, then adds a flower filter Y to B.

 Thread 1               A's refcnt   Thread 2
  RTM_NEWQDISC (A, RTNL-locked)
   qdisc_create(A)               1
   qdisc_graft(A)                9

  RTM_NEWTFILTER (X, RTNL-unlocked)
   __tcf_qdisc_find(A)          10
   tcf_chain0_head_change(A)
   mini_qdisc_pair_swap(A) (1st)
            |
            |                         RTM_NEWQDISC (B, RTNL-locked)
         RCU sync                2     qdisc_graft(B)
            |                    1     notify_and_destroy(A)
            |
   tcf_block_release(A)          0    RTM_NEWTFILTER (Y, RTNL-unlocked)
   qdisc_destroy(A)                    tcf_chain0_head_change(B)
   tcf_chain0_head_change_cb_del(A)    mini_qdisc_pair_swap(B) (2nd)
   mini_qdisc_pair_swap(A) (3rd)                |
           ...                                 ...

Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1.  Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().

This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.

Fixes: 7a096d57 ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f37392 ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: default avatar <syzbot+b53a9c0d1ea4ad62da8b@syzkaller.appspotmail.com>
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/


Cc: Hillf Danton <hdanton@sina.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: default avatarPeilin Ye <peilin.ye@bytedance.com>
Acked-by: default avatarJamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parent 2d5f6a8d
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
	refcount_inc(&qdisc->refcnt);
}

static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return true;
	return refcount_dec_if_one(&qdisc->refcnt);
}

/* Intended to be used by unlocked users, when concurrent qdisc release is
 * possible.
 */
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
+23 −5
Original line number Diff line number Diff line
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
				qdisc_put(old);
			}
		} else {
			dev_queue = dev_ingress_queue(dev);
			old = dev_graft_qdisc(dev_queue, new);
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new, extack);
		}

		if (dev->flags & IFF_UP)
+11 −3
Original line number Diff line number Diff line
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
	qdisc_free(q);
}

static void qdisc_destroy(struct Qdisc *qdisc)
static void __qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;

	__qdisc_destroy(qdisc);
}

void qdisc_put(struct Qdisc *qdisc)
{
	if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	__qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	__qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);