Commit 5b9a7bb7 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Cleanup of the io-wq per-node mapping, notably getting rid of it so
   we just have a single io_wq entry per ring (Breno)

 - Followup to the above, move accounting to io_wq as well and
   completely drop struct io_wqe (Gabriel)

 - Enable KASAN for the internal io_uring caches (Breno)

 - Add support for multishot timeouts. Some applications use timeouts to
   wake someone waiting on completion entries, and this makes it a bit
   easier to just have a recurring timer rather than needing to rearm it
   every time (David)

 - Support archs that have shared cache coloring between userspace and
   the kernel, and hence have strict address requirements for mmap'ing
   the ring into userspace. This should only be parisc/hppa. (Helge, me)

 - XFS has supported O_DIRECT writes without needing to lock the inode
   exclusively for a long time, and ext4 now supports it as well. This
   is true for the common cases of not extending the file size. Flag the
   fs as having that feature, and utilize that to avoid serializing
   those writes in io_uring (me)

 - Enable completion batching for uring commands (me)

 - Revert patch adding io_uring restriction to what can be GUP mapped or
   not. This does not belong in io_uring, as io_uring isn't really
   special in this regard. Since this is also getting in the way of
   cleanups and improvements to the GUP code, get rid of it (me)

 - A few series greatly reducing the complexity of registered resources,
   like buffers or files. Not only does this clean up the code a lot,
   the simplified code is also a LOT more efficient (Pavel)

 - Series optimizing how we wait for events and run task_work related to
   it (Pavel)

 - Fixes for file/buffer unregistration with DEFER_TASKRUN (Pavel)

 - Misc cleanups and improvements (Pavel, me)

* tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux: (71 commits)
  Revert "io_uring/rsrc: disallow multi-source reg buffers"
  io_uring: add support for multishot timeouts
  io_uring/rsrc: disassociate nodes and rsrc_data
  io_uring/rsrc: devirtualise rsrc put callbacks
  io_uring/rsrc: pass node to io_rsrc_put_work()
  io_uring/rsrc: inline io_rsrc_put_work()
  io_uring/rsrc: add empty flag in rsrc_node
  io_uring/rsrc: merge nodes and io_rsrc_put
  io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal
  io_uring/rsrc: remove unused io_rsrc_node::llist
  io_uring/rsrc: refactor io_queue_rsrc_removal
  io_uring/rsrc: simplify single file node switching
  io_uring/rsrc: clean up __io_sqe_buffers_update()
  io_uring/rsrc: inline switch_start fast path
  io_uring/rsrc: remove rsrc_data refs
  io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce
  io_uring/rsrc: use wq for quiescing
  io_uring/rsrc: refactor io_rsrc_ref_quiesce
  io_uring/rsrc: remove io_rsrc_node::done
  io_uring/rsrc: use nospec'ed indexes
  ...
parents 5c7ecada 3c85cc43
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
			return ret;
	}

	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
			FMODE_DIO_PARALLEL_WRITE;
	return dquot_file_open(inode, filp);
}

+2 −1
Original line number Diff line number Diff line
@@ -1171,7 +1171,8 @@ xfs_file_open(
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
			FMODE_DIO_PARALLEL_WRITE;
	return generic_file_open(inode, file);
}

+3 −0
Original line number Diff line number Diff line
@@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,

#define	FMODE_NOREUSE		((__force fmode_t)0x800000)

/* File supports non-exclusive O_DIRECT writes from multiple threads */
#define FMODE_DIO_PARALLEL_WRITE	((__force fmode_t)0x1000000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY		((__force fmode_t)0x4000000)

+15 −9
Original line number Diff line number Diff line
@@ -188,8 +188,10 @@ struct io_ev_fd {
};

struct io_alloc_cache {
	struct hlist_head	list;
	struct io_wq_work_node	list;
	unsigned int		nr_cached;
	unsigned int		max_cached;
	size_t			elem_size;
};

struct io_ring_ctx {
@@ -239,7 +241,6 @@ struct io_ring_ctx {
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node	*rsrc_node;
		int			rsrc_cached_refs;
		atomic_t		cancel_seq;
		struct io_file_table	file_table;
		unsigned		nr_user_files;
@@ -295,7 +296,7 @@ struct io_ring_ctx {
		spinlock_t		completion_lock;

		bool			poll_multi_queue;
		bool			cq_waiting;
		atomic_t		cq_wait_nr;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
@@ -325,16 +326,15 @@ struct io_ring_ctx {
	struct io_restriction		restrictions;

	/* slow path rsrc auxilary data, used by update/register */
	struct io_rsrc_node		*rsrc_backup_node;
	struct io_mapped_ubuf		*dummy_ubuf;
	struct io_rsrc_data		*file_data;
	struct io_rsrc_data		*buf_data;

	struct delayed_work		rsrc_put_work;
	struct callback_head		rsrc_put_tw;
	struct llist_head		rsrc_put_llist;
	/* protected by ->uring_lock */
	struct list_head		rsrc_ref_list;
	spinlock_t			rsrc_ref_lock;
	struct io_alloc_cache		rsrc_node_cache;
	struct wait_queue_head		rsrc_quiesce_wq;
	unsigned			rsrc_quiesce;

	struct list_head		io_buffers_pages;

@@ -366,6 +366,11 @@ struct io_ring_ctx {
	unsigned			evfd_last_cq_tail;
};

struct io_tw_state {
	/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
	bool locked;
};

enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
@@ -472,7 +477,7 @@ enum {
	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

struct io_task_work {
	struct llist_node		node;
@@ -562,6 +567,7 @@ struct io_kiocb {
	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_task_work		io_task_work;
	unsigned			nr_tw;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	union {
		struct hlist_node	hash_node;
+6 −9
Original line number Diff line number Diff line
@@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete,
);

/**
 * io_uring_submit_sqe - called before submitting one SQE
 * io_uring_submit_req - called before submitting a request
 *
 * @req:		pointer to a submitted request
 * @force_nonblock:	whether a context blocking or not
 *
 * Allows to track SQE submitting, to understand what was the source of it, SQ
 * thread or io_uring_enter call.
 */
TRACE_EVENT(io_uring_submit_sqe,
TRACE_EVENT(io_uring_submit_req,

	TP_PROTO(struct io_kiocb *req, bool force_nonblock),
	TP_PROTO(struct io_kiocb *req),

	TP_ARGS(req, force_nonblock),
	TP_ARGS(req),

	TP_STRUCT__entry (
		__field(  void *,		ctx		)
@@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe,
		__field(  unsigned long long,	user_data	)
		__field(  u8,			opcode		)
		__field(  u32,			flags		)
		__field(  bool,			force_nonblock	)
		__field(  bool,			sq_thread	)

		__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
		__entry->user_data	= req->cqe.user_data;
		__entry->opcode		= req->opcode;
		__entry->flags		= req->flags;
		__entry->force_nonblock	= force_nonblock;
		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;

		__assign_str(op_str, io_uring_get_opcode(req->opcode));
	),

	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
		  "sq_thread %d", __entry->ctx, __entry->req,
		  __entry->user_data, __get_str(op_str),
		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
		  __entry->flags, __entry->sq_thread)
);

/*
Loading