Commit e9bb1e48 authored by Breno Leitao, committed by Yifan Qiao
Browse files

io_uring/io-wq: Use set_bit() and test_bit() at worker->flags

mainline inclusion
from mainline-v6.10-rc1
commit 8a565304927fbd28c9f028c492b5c1714002cbab
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IACR2Y
CVE: CVE-2024-39508

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8a565304927fbd28c9f028c492b5c1714002cbab

--------------------------------

Utilize set_bit() and test_bit() on worker->flags within io_uring/io-wq
to address potential data races.

The structure io_worker->flags may be accessed through various data
paths, leading to concurrency issues. When KCSAN is enabled, it reveals
data races occurring in io_worker_handle_work and
io_wq_activate_free_worker functions.

	 BUG: KCSAN: data-race in io_worker_handle_work / io_wq_activate_free_worker
	 write to 0xffff8885c4246404 of 4 bytes by task 49071 on cpu 28:
	 io_worker_handle_work (io_uring/io-wq.c:434 io_uring/io-wq.c:569)
	 io_wq_worker (io_uring/io-wq.c:?)
<snip>

	 read to 0xffff8885c4246404 of 4 bytes by task 49024 on cpu 5:
	 io_wq_activate_free_worker (io_uring/io-wq.c:? io_uring/io-wq.c:285)
	 io_wq_enqueue (io_uring/io-wq.c:947)
	 io_queue_iowq (io_uring/io_uring.c:524)
	 io_req_task_submit (io_uring/io_uring.c:1511)
	 io_handle_tw_list (io_uring/io_uring.c:1198)
<snip>

Line numbers against commit 18daea77cca6 ("Merge tag 'for-linus' of
git://git.kernel.org/pub/scm/virt/kvm/kvm").

These races involve writes and reads to the same memory location by
different tasks running on different CPUs. To mitigate this, refactor
the code to use atomic operations such as set_bit(), test_bit(), and
clear_bit() instead of basic "and" and "or" operations. This ensures
thread-safe manipulation of worker flags.

Also, move `create_index` to avoid holes in the structure.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240507170002.2269003-1-leitao@debian.org


Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	fs/io-wq.c
	io_uring/io-wq.c
[The function names are different than cve commit. But their basic
functions are same and they are using the io_worker in the same way.]
Signed-off-by: Yifan Qiao <qiaoyifan4@huawei.com>
parent ca57ed16
Loading
Loading
Loading
Loading
+30 −29
Original line number Diff line number Diff line
@@ -25,12 +25,12 @@
#define WORKER_IDLE_TIMEOUT	(5 * HZ)

enum {
	IO_WORKER_F_UP		= 1,	/* up and active */
	IO_WORKER_F_RUNNING	= 2,	/* account as running */
	IO_WORKER_F_FREE	= 4,	/* worker on free list */
	IO_WORKER_F_EXITING	= 8,	/* worker exiting */
	IO_WORKER_F_FIXED	= 16,	/* static idle worker */
	IO_WORKER_F_BOUND	= 32,	/* is doing bounded work */
	IO_WORKER_F_UP		= 0,	/* up and active */
	IO_WORKER_F_RUNNING	= 1,	/* account as running */
	IO_WORKER_F_FREE	= 2,	/* worker on free list */
	IO_WORKER_F_EXITING	= 3,	/* worker exiting */
	IO_WORKER_F_FIXED	= 4,	/* static idle worker */
	IO_WORKER_F_BOUND	= 5,	/* is doing bounded work */
};

enum {
@@ -48,7 +48,7 @@ enum {
 */
struct io_worker {
	refcount_t ref;
	unsigned flags;
	unsigned long flags;
	struct hlist_nulls_node nulls_node;
	struct list_head all_list;
	struct task_struct *task;
@@ -193,7 +193,7 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
						  struct io_worker *worker)
{
	if (worker->flags & IO_WORKER_F_BOUND)
	if (test_bit(IO_WORKER_F_BOUND, &worker->flags))
		return &wqe->acct[IO_WQ_ACCT_BOUND];

	return &wqe->acct[IO_WQ_ACCT_UNBOUND];
@@ -215,9 +215,9 @@ static void io_worker_exit(struct io_worker *worker)

	preempt_disable();
	current->flags &= ~PF_IO_WORKER;
	if (worker->flags & IO_WORKER_F_RUNNING)
	if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
		atomic_dec(&acct->nr_running);
	if (!(worker->flags & IO_WORKER_F_BOUND))
	if (!test_bit(IO_WORKER_F_BOUND, &worker->flags))
		atomic_dec(&wqe->wq->user->processes);
	worker->flags = 0;
	preempt_enable();
@@ -315,7 +315,8 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)

	current->flags |= PF_IO_WORKER;

	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
	set_mask_bits(&worker->flags, 0,
		      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));
	worker->restore_files = current->files;
	worker->restore_fs = current->fs;
	io_wqe_inc_running(wqe, worker);
@@ -331,8 +332,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
{
	bool worker_bound, work_bound;

	if (worker->flags & IO_WORKER_F_FREE) {
		worker->flags &= ~IO_WORKER_F_FREE;
	if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
		clear_bit(IO_WORKER_F_FREE, &worker->flags);
		hlist_nulls_del_init_rcu(&worker->nulls_node);
	}

@@ -340,17 +341,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
	 * If worker is moving from bound to unbound (or vice versa), then
	 * ensure we update the running accounting.
	 */
	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
	worker_bound = test_bit(IO_WORKER_F_BOUND, &worker->flags);
	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
	if (worker_bound != work_bound) {
		io_wqe_dec_running(wqe, worker);
		if (work_bound) {
			worker->flags |= IO_WORKER_F_BOUND;
			set_bit(IO_WORKER_F_BOUND, &worker->flags);
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
			atomic_dec(&wqe->wq->user->processes);
		} else {
			worker->flags &= ~IO_WORKER_F_BOUND;
			clear_bit(IO_WORKER_F_BOUND, &worker->flags);
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
			atomic_inc(&wqe->wq->user->processes);
@@ -369,8 +370,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
	__must_hold(wqe->lock)
{
	if (!(worker->flags & IO_WORKER_F_FREE)) {
		worker->flags |= IO_WORKER_F_FREE;
	if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
		set_bit(IO_WORKER_F_FREE, &worker->flags);
		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
	}

@@ -585,7 +586,7 @@ static int io_wqe_worker(void *data)
			continue;
		/* timed out, exit unless we're the fixed worker */
		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
		    !(worker->flags & IO_WORKER_F_FIXED))
		    !test_bit(IO_WORKER_F_FIXED, &worker->flags))
			break;
	}

@@ -609,11 +610,11 @@ void io_wq_worker_running(struct task_struct *tsk)
	struct io_worker *worker = kthread_data(tsk);
	struct io_wqe *wqe = worker->wqe;

	if (!(worker->flags & IO_WORKER_F_UP))
	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
		return;
	if (worker->flags & IO_WORKER_F_RUNNING)
	if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
		return;
	worker->flags |= IO_WORKER_F_RUNNING;
	set_bit(IO_WORKER_F_RUNNING, &worker->flags);
	io_wqe_inc_running(wqe, worker);
}

@@ -627,12 +628,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
	struct io_worker *worker = kthread_data(tsk);
	struct io_wqe *wqe = worker->wqe;

	if (!(worker->flags & IO_WORKER_F_UP))
	if (!test_bit(IO_WORKER_F_UP, &worker->flags))
		return;
	if (!(worker->flags & IO_WORKER_F_RUNNING))
	if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
		return;

	worker->flags &= ~IO_WORKER_F_RUNNING;
	clear_bit(IO_WORKER_F_RUNNING, &worker->flags);

	spin_lock_irq(&wqe->lock);
	io_wqe_dec_running(wqe, worker);
@@ -663,11 +664,11 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
	spin_lock_irq(&wqe->lock);
	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
	worker->flags |= IO_WORKER_F_FREE;
	set_bit(IO_WORKER_F_FREE, &worker->flags);
	if (index == IO_WQ_ACCT_BOUND)
		worker->flags |= IO_WORKER_F_BOUND;
	if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
		worker->flags |= IO_WORKER_F_FIXED;
		set_bit(IO_WORKER_F_BOUND, &worker->flags);
	if (!acct->nr_workers && test_bit(IO_WORKER_F_BOUND, &worker->flags))
		set_bit(IO_WORKER_F_FIXED, &worker->flags);
	acct->nr_workers++;
	spin_unlock_irq(&wqe->lock);