Commit af472a9e authored by Linus Torvalds

Merge tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Fixes for current file position. Still doesn't have the f_pos_lock
   sorted, but it's a step in the right direction (Dylan)

 - Tracing updates (Dylan, Stefan)

 - Improvements to io-wq locking (Hao)

 - Improvements for provided buffers (me, Pavel)

 - Support for registered file descriptors (me, Xiaoguang); see the
   sketch at the end of this page

 - Support for ring messages (me); see the sketch after this list

 - Poll improvements (me)

 - Fix for fixed buffers and non-iterator reads/writes (me)

 - Support for NAPI on sockets (Olivier)

 - Ring quiesce improvements (Usama)

 - Misc fixes (Olivier, Pavel)
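
A quick illustration of the "ring messages" item: it maps to the new
IORING_OP_MSG_RING opcode and IORING_CQE_F_MSG completion flag in the
include/uapi diff at the bottom of this page. The following is a minimal
userspace sketch, assuming liburing >= 2.2 for io_uring_prep_msg_ring();
the length (0x10) and user_data (0x1234) values are arbitrary and error
handling is omitted:

	/* Sketch: post a "message" CQE from one ring into another via
	 * IORING_OP_MSG_RING. Assumes liburing >= 2.2. */
	#include <liburing.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring src, dst;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;

		io_uring_queue_init(8, &src, 0);
		io_uring_queue_init(8, &dst, 0);

		/* The target ring sees a CQE with res == 0x10 and
		 * user_data == 0x1234, both chosen by the sender. */
		sqe = io_uring_get_sqe(&src);
		io_uring_prep_msg_ring(sqe, dst.ring_fd, 0x10, 0x1234, 0);
		io_uring_submit(&src);

		io_uring_wait_cqe(&dst, &cqe);
		/* CQEs generated this way carry IORING_CQE_F_MSG. */
		printf("res=%d data=0x%llx msg=%d\n", cqe->res,
		       (unsigned long long) cqe->user_data,
		       !!(cqe->flags & IORING_CQE_F_MSG));
		io_uring_cqe_seen(&dst, cqe);

		io_uring_queue_exit(&src);
		io_uring_queue_exit(&dst);
		return 0;
	}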

* tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block: (42 commits)
  io_uring: terminate manual loop iterator loop correctly for non-vecs
  io_uring: don't check unrelated req->open.how in accept request
  io_uring: manage provided buffers strictly ordered
  io_uring: fold evfd signalling under a slower path
  io_uring: thin down io_commit_cqring()
  io_uring: shuffle io_eventfd_signal() bits around
  io_uring: remove extra barrier for non-sqpoll iopoll
  io_uring: fix provided buffer return on failure for kiocb_done()
  io_uring: extend provided buf return to fails
  io_uring: refactor timeout cancellation cqe posting
  io_uring: normilise naming for fill_cqe*
  io_uring: cache poll/double-poll state with a request flag
  io_uring: cache req->apoll->events in req->cflags
  io_uring: move req->poll_refs into previous struct hole
  io_uring: make tracing format consistent
  io_uring: recycle apoll_poll entries
  io_uring: remove duplicated member check for io_msg_ring_prep()
  io_uring: allow submissions to continue on error
  io_uring: recycle provided buffers if request goes async
  io_uring: ensure reads re-import for selected buffers
  ...
parents 93e220a6 5e929367
+57 −57
@@ -76,6 +76,7 @@ struct io_wqe_acct {
 	unsigned max_workers;
 	int index;
 	atomic_t nr_running;
+	raw_spinlock_t lock;
 	struct io_wq_work_list work_list;
 	unsigned long flags;
 };
@@ -91,7 +92,7 @@ enum {
  */
 struct io_wqe {
 	raw_spinlock_t lock;
-	struct io_wqe_acct acct[2];
+	struct io_wqe_acct acct[IO_WQ_ACCT_NR];
 
 	int node;
 
@@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
 	if (worker->flags & IO_WORKER_F_FREE)
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
-	preempt_disable();
+	raw_spin_unlock(&wqe->lock);
 	io_wqe_dec_running(worker);
 	worker->flags = 0;
+	preempt_disable();
 	current->flags &= ~PF_IO_WORKER;
 	preempt_enable();
-	raw_spin_unlock(&wqe->lock);
 
 	kfree_rcu(worker, rcu);
 	io_worker_ref_put(wqe->wq);
@@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
 
 static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
 {
+	bool ret = false;
+
+	raw_spin_lock(&acct->lock);
 	if (!wq_list_empty(&acct->work_list) &&
 	    !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
-		return true;
-	return false;
+		ret = true;
+	raw_spin_unlock(&acct->lock);
+
+	return ret;
 }
 
 /*
@@ -385,7 +391,6 @@ static bool io_queue_worker_create(struct io_worker *worker,
 }
 
 static void io_wqe_dec_running(struct io_worker *worker)
-	__must_hold(wqe->lock)
 {
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 	struct io_wqe *wqe = worker->wqe;
@@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 
-	if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
-		atomic_inc(&acct->nr_running);
-		atomic_inc(&wqe->wq->worker_refs);
-		raw_spin_unlock(&wqe->lock);
-		io_queue_worker_create(worker, acct, create_worker_cb);
-		raw_spin_lock(&wqe->lock);
-	}
+	if (!atomic_dec_and_test(&acct->nr_running))
+		return;
+	if (!io_acct_run_queue(acct))
+		return;
+
+	atomic_inc(&acct->nr_running);
+	atomic_inc(&wqe->wq->worker_refs);
+	io_queue_worker_create(worker, acct, create_worker_cb);
 }
 
 /*
@@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
  * it's currently on the freelist
  */
 static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
-	__must_hold(wqe->lock)
 {
 	if (worker->flags & IO_WORKER_F_FREE) {
 		worker->flags &= ~IO_WORKER_F_FREE;
+		raw_spin_lock(&wqe->lock);
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
+		raw_spin_unlock(&wqe->lock);
 	}
 }
 
@@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 
 static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
 					   struct io_worker *worker)
-	__must_hold(wqe->lock)
+	__must_hold(acct->lock)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work, *tail;
@@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
 		 * work being added and clearing the stalled bit.
 		 */
 		set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
-		raw_spin_unlock(&wqe->lock);
+		raw_spin_unlock(&acct->lock);
 		unstalled = io_wait_on_hash(wqe, stall_hash);
-		raw_spin_lock(&wqe->lock);
+		raw_spin_lock(&acct->lock);
 		if (unstalled) {
 			clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
 			if (wq_has_sleeper(&wqe->wq->hash->wait))
@@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker,
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 
 static void io_worker_handle_work(struct io_worker *worker)
-	__releases(wqe->lock)
 {
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 	struct io_wqe *wqe = worker->wqe;
@@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * can't make progress, any work completion or insertion will
 		 * clear the stalled flag.
 		 */
+		raw_spin_lock(&acct->lock);
 		work = io_get_next_work(acct, worker);
+		raw_spin_unlock(&acct->lock);
 		if (work) {
 			__io_worker_busy(wqe, worker);
 
@@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker)
 			raw_spin_lock(&worker->lock);
 			worker->next_work = work;
 			raw_spin_unlock(&worker->lock);
-		}
-		raw_spin_unlock(&wqe->lock);
-		if (!work)
+		} else {
 			break;
+		}
 		io_assign_current_work(worker, work);
 		__set_current_state(TASK_RUNNING);
 
@@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 					wake_up(&wq->hash->wait);
 			}
 		} while (work);
-
-		raw_spin_lock(&wqe->lock);
 	} while (1);
 }
 
@@ -633,12 +638,10 @@ static int io_wqe_worker(void *data)
 		long ret;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-loop:
-		raw_spin_lock(&wqe->lock);
-		if (io_acct_run_queue(acct)) {
+		while (io_acct_run_queue(acct))
 			io_worker_handle_work(worker);
-			goto loop;
-		}
+
+		raw_spin_lock(&wqe->lock);
 		/* timed out, exit unless we're the last worker */
 		if (last_timeout && acct->nr_workers > 1) {
 			acct->nr_workers--;
@@ -662,10 +665,8 @@ static int io_wqe_worker(void *data)
 		last_timeout = !ret;
 	}
 
-	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
-		raw_spin_lock(&wqe->lock);
+	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		io_worker_handle_work(worker);
-	}
 
 	audit_free(current);
 	io_worker_exit(worker);
@@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 		return;
 
 	worker->flags &= ~IO_WORKER_F_RUNNING;
-
-	raw_spin_lock(&worker->wqe->lock);
 	io_wqe_dec_running(worker);
-	raw_spin_unlock(&worker->wqe->lock);
 }
 
 static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb)
 				.cancel_all	= true,
 			};
 
+			raw_spin_unlock(&wqe->lock);
 			while (io_acct_cancel_pending_work(wqe, acct, &match))
-				raw_spin_lock(&wqe->lock);
-		}
-		raw_spin_unlock(&wqe->lock);
+				;
+		} else {
+			raw_spin_unlock(&wqe->lock);
+		}
 		io_worker_ref_put(wqe->wq);
 		kfree(worker);
 		return;
@@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+	struct io_cb_cancel_data match;
 	unsigned work_flags = work->flags;
 	bool do_create;
 
@@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 		return;
 	}
 
-	raw_spin_lock(&wqe->lock);
+	raw_spin_lock(&acct->lock);
 	io_wqe_insert_work(wqe, work);
 	clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+	raw_spin_unlock(&acct->lock);
 
+	raw_spin_lock(&wqe->lock);
 	rcu_read_lock();
 	do_create = !io_wqe_activate_free_worker(wqe, acct);
 	rcu_read_unlock();
@@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 			return;
 
 		raw_spin_lock(&wqe->lock);
-		/* fatal condition, failed to create the first worker */
-		if (!acct->nr_workers) {
-			struct io_cb_cancel_data match = {
-				.fn		= io_wq_work_match_item,
-				.data		= work,
-				.cancel_all	= false,
-			};
-
-			if (io_acct_cancel_pending_work(wqe, acct, &match))
-				raw_spin_lock(&wqe->lock);
+		if (acct->nr_workers) {
+			raw_spin_unlock(&wqe->lock);
+			return;
 		}
 		raw_spin_unlock(&wqe->lock);
+
+		/* fatal condition, failed to create the first worker */
+		match.fn		= io_wq_work_match_item,
+		match.data		= work,
+		match.cancel_all	= false,
+
+		io_acct_cancel_pending_work(wqe, acct, &match);
 	}
 }
 
@@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
 static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
 					struct io_wqe_acct *acct,
 					struct io_cb_cancel_data *match)
-	__releases(wqe->lock)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 
+	raw_spin_lock(&acct->lock);
 	wq_list_for_each(node, prev, &acct->work_list) {
 		work = container_of(node, struct io_wq_work, list);
 		if (!match->fn(work, match->data))
 			continue;
 		io_wqe_remove_pending(wqe, work, prev);
-		raw_spin_unlock(&wqe->lock);
+		raw_spin_unlock(&acct->lock);
 		io_run_cancel(work, wqe);
 		match->nr_pending++;
 		/* not safe to continue after unlock */
 		return true;
 	}
+	raw_spin_unlock(&acct->lock);
 
 	return false;
 }
@@ -1061,7 +1065,6 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 		struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
 
 		if (io_acct_cancel_pending_work(wqe, acct, match)) {
-			raw_spin_lock(&wqe->lock);
 			if (match->cancel_all)
 				goto retry;
 			break;
@@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_pending_work(wqe, &match);
-		if (match.nr_pending && !match.cancel_all) {
-			raw_spin_unlock(&wqe->lock);
+		if (match.nr_pending && !match.cancel_all)
 			return IO_WQ_CANCEL_OK;
-		}
 
+		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_running_work(wqe, &match);
 		raw_spin_unlock(&wqe->lock);
 		if (match.nr_running && !match.cancel_all)
@@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 			acct->index = i;
 			atomic_set(&acct->nr_running, 0);
 			INIT_WQ_LIST(&acct->work_list);
+			raw_spin_lock_init(&acct->lock);
 		}
 		wqe->wq = wq;
 		raw_spin_lock_init(&wqe->lock);
@@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq)
 			.fn		= io_wq_work_match_all,
 			.cancel_all	= true,
 		};
-		raw_spin_lock(&wqe->lock);
 		io_wqe_cancel_pending_work(wqe, &match);
-		raw_spin_unlock(&wqe->lock);
 		free_cpumask_var(wqe->cpu_mask);
 		kfree(wqe);
 	}
@@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
 	BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
 	BUILD_BUG_ON((int) IO_WQ_ACCT_NR      != 2);
 
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 		if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
 			new_count[i] = task_rlimit(current, RLIMIT_NPROC);
 	}
+967 −284

File changed; preview size limit exceeded, changes collapsed.
+4 −1
@@ -9,12 +9,15 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
 	if (current->io_uring)
+159 −174
@@ -44,7 +44,7 @@ TRACE_EVENT(io_uring_create,
 		__entry->flags		= flags;
 	),
 
-	TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
+	TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x",
 			  __entry->ctx, __entry->fd, __entry->sq_entries,
 			  __entry->cq_entries, __entry->flags)
 );
@@ -57,10 +57,9 @@ TRACE_EVENT(io_uring_create,
  * @opcode:		describes which operation to perform
  * @nr_user_files:	number of registered files
  * @nr_user_bufs:	number of registered buffers
- * @cq_ev_fd:		whether eventfs registered or not
  * @ret:		return code
  *
- * Allows to trace fixed files/buffers/eventfds, that could be registered to
+ * Allows to trace fixed files/buffers, that could be registered to
  * avoid an overhead of getting references to them for every operation. This
  * event, together with io_uring_file_get, can provide a full picture of how
  * much overhead one can reduce via fixing.
@@ -68,16 +67,15 @@ TRACE_EVENT(io_uring_create,
 TRACE_EVENT(io_uring_register,
 
 	TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
-			 unsigned nr_bufs, bool eventfd, long ret),
+			 unsigned nr_bufs, long ret),
 
-	TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),
+	TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx	)
 		__field(  unsigned,	opcode	)
 		__field(  unsigned,	nr_files)
 		__field(  unsigned,	nr_bufs	)
-		__field(  bool,		eventfd		)
 		__field(  long,		ret	)
 	),
 
@@ -86,20 +84,21 @@ TRACE_EVENT(io_uring_register,
 		__entry->opcode		= opcode;
 		__entry->nr_files	= nr_files;
 		__entry->nr_bufs	= nr_bufs;
-		__entry->eventfd	= eventfd;
 		__entry->ret		= ret;
 	),
 
 	TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
-			  "eventfd %d, ret %ld",
+			  "ret %ld",
 			  __entry->ctx, __entry->opcode, __entry->nr_files,
-			  __entry->nr_bufs, __entry->eventfd, __entry->ret)
+			  __entry->nr_bufs, __entry->ret)
 );
 
 /**
  * io_uring_file_get - called before getting references to an SQE file
  *
  * @ctx:	pointer to a ring context structure
+ * @req:	pointer to a submitted request
+ * @user_data:	user data associated with the request
  * @fd:		SQE file descriptor
  *
  * Allows to trace out how often an SQE file reference is obtained, which can
@@ -108,59 +107,71 @@ TRACE_EVENT(io_uring_register,
  */
 TRACE_EVENT(io_uring_file_get,
 
-	TP_PROTO(void *ctx, int fd),
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
 
-	TP_ARGS(ctx, fd),
+	TP_ARGS(ctx, req, user_data, fd),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
+		__field(  void *,	req		)
+		__field(  u64,		user_data	)
 		__field(  int,		fd		)
 	),
 
 	TP_fast_assign(
 		__entry->ctx		= ctx;
+		__entry->req		= req;
+		__entry->user_data	= user_data;
 		__entry->fd		= fd;
 	),
 
-	TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
+	TP_printk("ring %p, req %p, user_data 0x%llx, fd %d",
+		__entry->ctx, __entry->req, __entry->user_data, __entry->fd)
 );
 
 /**
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @ctx:	pointer to a ring context structure
- * @hashed:	type of workqueue, hashed or normal
  * @req:	pointer to a submitted request
+ * @user_data:	user data associated with the request
+ * @opcode:	opcode of request
+ * @flags	request flags
 * @work:	pointer to a submitted io_wq_work
+ * @rw:		type of workqueue, hashed or normal
 *
 * Allows to trace asynchronous work submission.
 */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
-			 unsigned int flags),
+	TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
+		unsigned int flags, struct io_wq_work *work, int rw),
 
-	TP_ARGS(ctx, rw, req, work, flags),
+	TP_ARGS(ctx, req, user_data, flags, opcode, work, rw),
 
 	TP_STRUCT__entry (
 		__field(  void *,			ctx		)
-		__field(  int,				rw	)
 		__field(  void *,			req		)
-		__field(  struct io_wq_work *,		work	)
+		__field(  u64,				user_data	)
+		__field(  u8,				opcode		)
 		__field(  unsigned int,			flags		)
+		__field(  struct io_wq_work *,		work		)
+		__field(  int,				rw		)
 	),
 
 	TP_fast_assign(
 		__entry->ctx		= ctx;
-		__entry->rw	= rw;
 		__entry->req		= req;
-		__entry->work	= work;
+		__entry->user_data	= user_data;
 		__entry->flags		= flags;
+		__entry->opcode		= opcode;
+		__entry->work		= work;
+		__entry->rw		= rw;
 	),
 
-	TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
-			  __entry->ctx, __entry->req, __entry->flags,
-			  __entry->rw ? "hashed" : "normal", __entry->work)
+	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p",
+		__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
+		__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
 );
 
 /**
@@ -169,30 +180,33 @@ TRACE_EVENT(io_uring_queue_async_work,
  * @ctx:	pointer to a ring context structure
  * @req:	pointer to a deferred request
  * @user_data:	user data associated with the request
+ * @opcode:	opcode of request
  *
  * Allows to track deferred requests, to get an insight about what requests are
  * not started immediately.
  */
 TRACE_EVENT(io_uring_defer,
 
-	TP_PROTO(void *ctx, void *req, unsigned long long user_data),
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
 
-	TP_ARGS(ctx, req, user_data),
+	TP_ARGS(ctx, req, user_data, opcode),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx	)
 		__field(  void *,		req	)
 		__field(  unsigned long long,	data	)
+		__field(  u8,			opcode	)
 	),
 
 	TP_fast_assign(
 		__entry->ctx	= ctx;
 		__entry->req	= req;
 		__entry->data	= user_data;
+		__entry->opcode	= opcode;
 	),
 
-	TP_printk("ring %p, request %p user_data %llu", __entry->ctx,
-			__entry->req, __entry->data)
+	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d",
+		__entry->ctx, __entry->req, __entry->data, __entry->opcode)
 );
 
 /**
@@ -260,7 +274,10 @@ TRACE_EVENT(io_uring_cqring_wait,
 /**
  * io_uring_fail_link - called before failing a linked request
  *
+ * @ctx:	pointer to a ring context structure
 * @req:	request, which links were cancelled
+ * @user_data:	user data associated with the request
+ * @opcode:	opcode of request
 * @link:	cancelled link
 *
 * Allows to track linked requests cancellation, to see not only that some work
@@ -268,27 +285,36 @@ TRACE_EVENT(io_uring_cqring_wait,
  */
 TRACE_EVENT(io_uring_fail_link,
 
-	TP_PROTO(void *req, void *link),
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
 
-	TP_ARGS(req, link),
+	TP_ARGS(ctx, req, user_data, opcode, link),
 
 	TP_STRUCT__entry (
+		__field(  void *,		ctx		)
 		__field(  void *,		req		)
+		__field(  unsigned long long,	user_data	)
+		__field(  u8,			opcode		)
 		__field(  void *,		link		)
 	),
 
 	TP_fast_assign(
+		__entry->ctx		= ctx;
 		__entry->req		= req;
+		__entry->user_data	= user_data;
+		__entry->opcode		= opcode;
 		__entry->link		= link;
 	),
 
-	TP_printk("request %p, link %p", __entry->req, __entry->link)
+	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p",
+		__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
+		__entry->link)
 );
 
 /**
  * io_uring_complete - called when completing an SQE
  *
  * @ctx:		pointer to a ring context structure
+ * @req:		pointer to a submitted request
  * @user_data:		user data associated with the request
 * @res:		result of the request
 * @cflags:		completion flags
@@ -296,12 +322,13 @@ TRACE_EVENT(io_uring_fail_link,
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
+	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags),
 
-	TP_ARGS(ctx, user_data, res, cflags),
+	TP_ARGS(ctx, req, user_data, res, cflags),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
+		__field(  void *,	req		)
 		__field(  u64,		user_data	)
 		__field(  int,		res		)
 		__field(  unsigned,	cflags		)
@@ -309,13 +336,15 @@ TRACE_EVENT(io_uring_complete,
 
 	TP_fast_assign(
 		__entry->ctx		= ctx;
+		__entry->req		= req;
 		__entry->user_data	= user_data;
 		__entry->res		= res;
 		__entry->cflags		= cflags;
 	),
 
-	TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
-			  __entry->ctx, (unsigned long long)__entry->user_data,
-			  __entry->res, __entry->cflags)
+	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x",
+		__entry->ctx, __entry->req,
+		__entry->user_data,
+		__entry->res, __entry->cflags)
 );
 
@@ -324,8 +353,8 @@ TRACE_EVENT(io_uring_complete,
 *
 * @ctx:		pointer to a ring context structure
 * @req:		pointer to a submitted request
- * @opcode:		opcode of request
 * @user_data:		user data associated with the request
+ * @opcode:		opcode of request
 * @flags		request flags
 * @force_nonblock:	whether a context blocking or not
 * @sq_thread:		true if sq_thread has submitted this SQE
@@ -335,16 +364,16 @@ TRACE_EVENT(io_uring_complete,
  */
 TRACE_EVENT(io_uring_submit_sqe,
 
-	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
 		 bool force_nonblock, bool sq_thread),
 
-	TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),
+	TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
 		__field(  void *,		req		)
+		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
-		__field(  u64,		user_data	)
 		__field(  u32,			flags		)
 		__field(  bool,			force_nonblock	)
 		__field(  bool,			sq_thread	)
@@ -353,16 +382,16 @@ TRACE_EVENT(io_uring_submit_sqe,
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->opcode		= opcode;
 		__entry->user_data	= user_data;
+		__entry->opcode		= opcode;
 		__entry->flags		= flags;
 		__entry->force_nonblock	= force_nonblock;
 		__entry->sq_thread	= sq_thread;
 	),
 
-	TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
+	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, "
 		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
-		  __entry->opcode, (unsigned long long)__entry->user_data,
+		  __entry->user_data, __entry->opcode,
 		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
 );
 
@@ -371,8 +400,8 @@ TRACE_EVENT(io_uring_submit_sqe,
 *
 * @ctx:		pointer to a ring context structure
 * @req:		pointer to the armed request
- * @opcode:		opcode of request
 * @user_data:		user data associated with the request
+ * @opcode:		opcode of request
 * @mask:		request poll events mask
 * @events:		registered events of interest
 *
@@ -381,16 +410,16 @@ TRACE_EVENT(io_uring_submit_sqe,
  */
 TRACE_EVENT(io_uring_poll_arm,
 
-	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
+	TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
 		 int mask, int events),
 
-	TP_ARGS(ctx, req, opcode, user_data, mask, events),
+	TP_ARGS(ctx, req, user_data, opcode, mask, events),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
 		__field(  void *,		req		)
+		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
-		__field(  u64,		user_data	)
 		__field(  int,			mask		)
 		__field(  int,			events		)
 	),
@@ -398,121 +427,74 @@ TRACE_EVENT(io_uring_poll_arm,
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->opcode		= opcode;
 		__entry->user_data	= user_data;
+		__entry->opcode		= opcode;
 		__entry->mask		= mask;
 		__entry->events		= events;
 	),
 
-	TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
-		  __entry->ctx, __entry->req, __entry->opcode,
-		  (unsigned long long) __entry->user_data,
+	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x",
+		  __entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
 		  __entry->mask, __entry->events)
 );
 
-TRACE_EVENT(io_uring_poll_wake,
-
-	TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
-
-	TP_ARGS(ctx, opcode, user_data, mask),
-
-	TP_STRUCT__entry (
-		__field(  void *,	ctx		)
-		__field(  u8,		opcode		)
-		__field(  u64,		user_data	)
-		__field(  int,		mask		)
-	),
-
-	TP_fast_assign(
-		__entry->ctx		= ctx;
-		__entry->opcode		= opcode;
-		__entry->user_data	= user_data;
-		__entry->mask		= mask;
-	),
-
-	TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
-			  __entry->ctx, __entry->opcode,
-			  (unsigned long long) __entry->user_data,
-			  __entry->mask)
-);
-
-TRACE_EVENT(io_uring_task_add,
-
-	TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
-
-	TP_ARGS(ctx, opcode, user_data, mask),
-
-	TP_STRUCT__entry (
-		__field(  void *,	ctx		)
-		__field(  u8,		opcode		)
-		__field(  u64,		user_data	)
-		__field(  int,		mask		)
-	),
-
-	TP_fast_assign(
-		__entry->ctx		= ctx;
-		__entry->opcode		= opcode;
-		__entry->user_data	= user_data;
-		__entry->mask		= mask;
-	),
-
-	TP_printk("ring %p, op %d, data 0x%llx, mask %x",
-			  __entry->ctx, __entry->opcode,
-			  (unsigned long long) __entry->user_data,
-			  __entry->mask)
-);
-
 /*
- * io_uring_task_run - called when task_work_run() executes the poll events
- *                     notification callbacks
+ * io_uring_task_add - called after adding a task
  *
  * @ctx:		pointer to a ring context structure
- * @req:		pointer to the armed request
- * @opcode:		opcode of request
+ * @req:		pointer to request
 * @user_data:		user data associated with the request
+ * @opcode:		opcode of request
+ * @mask:		request poll events mask
 *
 * Allows to track when notified poll events are processed
 */
-TRACE_EVENT(io_uring_task_run,
+TRACE_EVENT(io_uring_task_add,
 
-	TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
 
-	TP_ARGS(ctx, req, opcode, user_data),
+	TP_ARGS(ctx, req, user_data, opcode, mask),
 
 	TP_STRUCT__entry (
 		__field(  void *,		ctx		)
 		__field(  void *,		req		)
+		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
-		__field(  u64,		user_data	)
+		__field(  int,			mask		)
 	),
 
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->opcode		= opcode;
 		__entry->user_data	= user_data;
+		__entry->opcode		= opcode;
+		__entry->mask		= mask;
 	),
 
-	TP_printk("ring %p, req %p, op %d, data 0x%llx",
-		  __entry->ctx, __entry->req, __entry->opcode,
-		  (unsigned long long) __entry->user_data)
+	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x",
+		__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
+		__entry->mask)
 );
 
 /*
  * io_uring_req_failed - called when an sqe is errored dring submission
  *
  * @sqe:		pointer to the io_uring_sqe that failed
+ * @ctx:		pointer to a ring context structure
+ * @req:		pointer to request
  * @error:		error it failed with
  *
  * Allows easier diagnosing of malformed requests in production systems.
  */
 TRACE_EVENT(io_uring_req_failed,
 
-	TP_PROTO(const struct io_uring_sqe *sqe, int error),
+	TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
 
-	TP_ARGS(sqe, error),
+	TP_ARGS(sqe, ctx, req, error),
 
 	TP_STRUCT__entry (
+		__field(  void *,		ctx		)
+		__field(  void *,		req		)
+		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  u8,			flags		)
 		__field(  u8,			ioprio		)
@@ -520,7 +502,6 @@ TRACE_EVENT(io_uring_req_failed,
 		__field( u64,			addr		)
 		__field( u32,			len		)
 		__field( u32,			op_flags	)
-		__field( u64,	user_data )
 		__field( u16,			buf_index	)
 		__field( u16,			personality	)
 		__field( u32,			file_index	)
@@ -530,6 +511,9 @@ TRACE_EVENT(io_uring_req_failed,
 	),
 
 	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->req		= req;
+		__entry->user_data	= sqe->user_data;
 		__entry->opcode		= sqe->opcode;
 		__entry->flags		= sqe->flags;
 		__entry->ioprio		= sqe->ioprio;
@@ -537,7 +521,6 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->addr		= sqe->addr;
 		__entry->len		= sqe->len;
 		__entry->op_flags	= sqe->rw_flags;
-		__entry->user_data	= sqe->user_data;
 		__entry->buf_index	= sqe->buf_index;
 		__entry->personality	= sqe->personality;
 		__entry->file_index	= sqe->file_index;
@@ -546,13 +529,15 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->error		= error;
 	),
 
-	TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
-		  "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
+	TP_printk("ring %p, req %p, user_data 0x%llx, "
+		"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
+		  "len=%u, rw_flags=0x%x, buf_index=%d, "
 		  "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
+		  __entry->ctx, __entry->req, __entry->user_data,
 		  __entry->opcode, __entry->flags, __entry->ioprio,
 		  (unsigned long long)__entry->off,
 		  (unsigned long long) __entry->addr, __entry->len,
-		  __entry->op_flags, (unsigned long long) __entry->user_data,
+		  __entry->op_flags,
 		  __entry->buf_index, __entry->personality, __entry->file_index,
 		  (unsigned long long) __entry->pad1,
 		  (unsigned long long) __entry->pad2, __entry->error)
+13 −4
@@ -101,6 +101,7 @@ enum {
 #define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
+#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
 
 enum {
 	IORING_OP_NOP,
@@ -143,6 +144,7 @@ enum {
 	IORING_OP_MKDIRAT,
 	IORING_OP_SYMLINKAT,
 	IORING_OP_LINKAT,
+	IORING_OP_MSG_RING,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -199,9 +201,11 @@ struct io_uring_cqe {
  *
  * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
  * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
+ * IORING_CQE_F_MSG	If set, CQE was generated with IORING_OP_MSG_RING
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
+#define IORING_CQE_F_MSG		(1U << 2)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
@@ -261,6 +265,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAKEUP		(1U << 1)
 #define IORING_ENTER_SQ_WAIT		(1U << 2)
 #define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +330,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
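
The IORING_REGISTER_RING_FDS / IORING_UNREGISTER_RING_FDS opcodes and the
IORING_ENTER_REGISTERED_RING flag above are the UAPI side of the "registered
file descriptors" item from the summary: the ring fd is registered once, and
later io_uring_enter() calls pass the registered index with
IORING_ENTER_REGISTERED_RING set, skipping the per-syscall fdget/fdput. A
hedged sketch of the userspace side, assuming liburing >= 2.2, whose
io_uring_register_ring_fd() wraps this and switches subsequent submits to the
registered index automatically:

	/* Sketch: register the ring fd so later io_uring_enter() calls
	 * skip fdget/fdput. Assumes liburing >= 2.2, where
	 * io_uring_register_ring_fd() returns 1 (fds registered) on
	 * success or -errno on failure. */
	#include <liburing.h>

	int setup_ring_with_registered_fd(struct io_uring *ring)
	{
		int ret = io_uring_queue_init(64, ring, 0);

		if (ret < 0)
			return ret;

		/* Issues io_uring_register(2) with IORING_REGISTER_RING_FDS;
		 * liburing then sets IORING_ENTER_REGISTERED_RING itself. */
		ret = io_uring_register_ring_fd(ring);
		if (ret != 1) {
			io_uring_queue_exit(ring);
			return ret;
		}
		return 0;
	}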