1159 lines
31 KiB
Groff
1159 lines
31 KiB
Groff
.\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
|
|
.\" Copyright (C) 2019 Red Hat, Inc.
|
|
.\"
|
|
.\" SPDX-License-Identifier: LGPL-2.0-or-later
|
|
.\"
|
|
.TH IO_URING_ENTER 2 2019-01-22 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
io_uring_enter \- initiate and/or complete asynchronous I/O
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.BR "#include <linux/io_uring.h>"
|
|
.PP
|
|
.BI "int io_uring_enter(unsigned int " fd ", unsigned int " to_submit ,
|
|
.BI " unsigned int " min_complete ", unsigned int " flags ,
|
|
.BI " sigset_t *" sig );
|
|
.fi
|
|
.PP
|
|
.SH DESCRIPTION
|
|
.PP
|
|
.BR io_uring_enter ()
|
|
is used to initiate and complete I/O using the shared submission and
|
|
completion queues setup by a call to
|
|
.BR io_uring_setup (2).
|
|
A single call can both submit new I/O and wait for completions of I/O
|
|
initiated by this call or previous calls to
|
|
.BR io_uring_enter ().
|
|
|
|
.I fd
|
|
is the file descriptor returned by
|
|
.BR io_uring_setup (2).
|
|
.I to_submit
|
|
specifies the number of I/Os to submit from the submission queue.
|
|
.I flags
|
|
is a bitmask of the following values:
|
|
.TP
|
|
.B IORING_ENTER_GETEVENTS
|
|
If this flag is set, then the system call will wait for the specified
|
|
number of events in
|
|
.I min_complete
|
|
before returning. This flag can be set along with
|
|
.I to_submit
|
|
to both submit and complete events in a single system call.
|
|
.TP
|
|
.B IORING_ENTER_SQ_WAKEUP
|
|
If the ring has been created with
|
|
.B IORING_SETUP_SQPOLL,
|
|
then this flag asks the kernel to wakeup the SQ kernel thread to submit IO.
|
|
.TP
|
|
.B IORING_ENTER_SQ_WAIT
|
|
If the ring has been created with
|
|
.B IORING_SETUP_SQPOLL,
|
|
then the application has no real insight into when the SQ kernel thread has
|
|
consumed entries from the SQ ring. This can lead to a situation where the
|
|
application can no longer get a free SQE entry to submit, without knowing
|
|
when one becomes available as the SQ kernel thread consumes them. If
|
|
the system call is used with this flag set, then it will wait until at least
|
|
one entry is free in the SQ ring.
|
|
.PP
|
|
.PP
|
|
If the io_uring instance was configured for polling, by specifying
|
|
.B IORING_SETUP_IOPOLL
|
|
in the call to
|
|
.BR io_uring_setup (2),
|
|
then min_complete has a slightly different meaning. Passing a value
|
|
of 0 instructs the kernel to return any events which are already complete,
|
|
without blocking. If
|
|
.I min_complete
|
|
is a non-zero value, the kernel will still return immediately if any
|
|
completion events are available. If no event completions are
|
|
available, then the call will poll either until one or more
|
|
completions become available, or until the process has exceeded its
|
|
scheduler time slice.
|
|
|
|
Note that, for interrupt driven I/O (where
|
|
.B IORING_SETUP_IOPOLL
|
|
was not specified in the call to
|
|
.BR io_uring_setup (2)),
|
|
an application may check the completion queue for event completions
|
|
without entering the kernel at all.
|
|
.PP
|
|
When the system call returns that a certain amount of SQEs have been
|
|
consumed and submitted, it's safe to reuse SQE entries in the ring. This is
|
|
true even if the actual IO submission had to be punted to async context,
|
|
which means that the SQE may in fact not have been submitted yet. If the
|
|
kernel requires later use of a particular SQE entry, it will have made a
|
|
private copy of it.
|
|
|
|
.I sig
|
|
is a pointer to a signal mask (see
|
|
.BR sigprocmask (2));
|
|
if
|
|
.I sig
|
|
is not NULL,
|
|
.BR io_uring_enter ()
|
|
first replaces the current signal mask by the one pointed to by
|
|
.IR sig ,
|
|
then waits for events to become available in the completion queue, and
|
|
then restores the original signal mask. The following
|
|
.BR io_uring_enter ()
|
|
call:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
|
|
.EE
|
|
.in
|
|
.PP
|
|
is equivalent to
|
|
.I atomically
|
|
executing the following calls:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
pthread_sigmask(SIG_SETMASK, &sig, &orig);
|
|
ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
|
|
pthread_sigmask(SIG_SETMASK, &orig, NULL);
|
|
.EE
|
|
.in
|
|
.PP
|
|
See the description of
|
|
.BR pselect (2)
|
|
for an explanation of why the
|
|
.I sig
|
|
parameter is necessary.
|
|
|
|
Submission queue entries are represented using the following data
|
|
structure:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
/*
|
|
* IO submission data structure (Submission Queue Entry)
|
|
*/
|
|
struct io_uring_sqe {
|
|
__u8 opcode; /* type of operation for this sqe */
|
|
__u8 flags; /* IOSQE_ flags */
|
|
__u16 ioprio; /* ioprio for the request */
|
|
__s32 fd; /* file descriptor to do IO on */
|
|
union {
|
|
__u64 off; /* offset into file */
|
|
__u64 addr2;
|
|
};
|
|
union {
|
|
__u64 addr; /* pointer to buffer or iovecs */
|
|
__u64 splice_off_in;
|
|
};
|
|
__u32 len; /* buffer size or number of iovecs */
|
|
union {
|
|
__kernel_rwf_t rw_flags;
|
|
__u32 fsync_flags;
|
|
__u16 poll_events; /* compatibility */
|
|
__u32 poll32_events; /* word-reversed for BE */
|
|
__u32 sync_range_flags;
|
|
__u32 msg_flags;
|
|
__u32 timeout_flags;
|
|
__u32 accept_flags;
|
|
__u32 cancel_flags;
|
|
__u32 open_flags;
|
|
__u32 statx_flags;
|
|
__u32 fadvise_advice;
|
|
__u32 splice_flags;
|
|
};
|
|
__u64 user_data; /* data to be passed back at completion time */
|
|
union {
|
|
struct {
|
|
/* index into fixed buffers, if used */
|
|
union {
|
|
/* index into fixed buffers, if used */
|
|
__u16 buf_index;
|
|
/* for grouped buffer selection */
|
|
__u16 buf_group;
|
|
};
|
|
/* personality to use, if used */
|
|
__u16 personality;
|
|
__s32 splice_fd_in;
|
|
};
|
|
__u64 __pad2[3];
|
|
};
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The
|
|
.I opcode
|
|
describes the operation to be performed. It can be one of:
|
|
.TP
|
|
.B IORING_OP_NOP
|
|
Do not perform any I/O. This is useful for testing the performance of
|
|
the io_uring implementation itself.
|
|
.TP
|
|
.B IORING_OP_READV
|
|
.TP
|
|
.B IORING_OP_WRITEV
|
|
Vectored read and write operations, similar to
|
|
.BR preadv2 (2)
|
|
and
|
|
.BR pwritev2 (2).
|
|
If the file is not seekable,
|
|
.I off
|
|
must be set to zero.
|
|
|
|
.TP
|
|
.B IORING_OP_READ_FIXED
|
|
.TP
|
|
.B IORING_OP_WRITE_FIXED
|
|
Read from or write to pre-mapped buffers. See
|
|
.BR io_uring_register (2)
|
|
for details on how to setup a context for fixed reads and writes.
|
|
|
|
.TP
|
|
.B IORING_OP_FSYNC
|
|
File sync. See also
|
|
.BR fsync (2).
|
|
Note that, while I/O is initiated in the order in which it appears in
|
|
the submission queue, completions are unordered. For example, an
|
|
application which places a write I/O followed by an fsync in the
|
|
submission queue cannot expect the fsync to apply to the write. The
|
|
two operations execute in parallel, so the fsync may complete before
|
|
the write is issued to the storage. The same is also true for
|
|
previously issued writes that have not completed prior to the fsync.
|
|
|
|
.TP
|
|
.B IORING_OP_POLL_ADD
|
|
Poll the
|
|
.I fd
|
|
specified in the submission queue entry for the events
|
|
specified in the
|
|
.I poll_events
|
|
field. Unlike poll or epoll without
|
|
.BR EPOLLONESHOT ,
|
|
this interface always works in one shot mode. That is, once the poll
|
|
operation is completed, it will have to be resubmitted. This command works like
|
|
an async
|
|
.BR poll (2)
|
|
and the completion event result is the returned mask of events.
|
|
|
|
.TP
|
|
.B IORING_OP_POLL_REMOVE
|
|
Remove an existing poll request. If found, the
|
|
.I res
|
|
field of the
|
|
.I "struct io_uring_cqe"
|
|
will contain 0. If not found,
|
|
.I res
|
|
will contain
|
|
.B -ENOENT.
|
|
|
|
.TP
|
|
.B IORING_OP_EPOLL_CTL
|
|
Add, remove or modify entries in the interest list of
|
|
.BR epoll (7).
|
|
See
|
|
.BR epoll_ctl (2)
|
|
for details of the system call.
|
|
.I fd
|
|
holds the file descriptor that represents the epoll instance,
|
|
.I addr
|
|
holds the file descriptor to add, remove or modify,
|
|
.I len
|
|
holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and,
|
|
.I off
|
|
holds a pointer to the
|
|
.I epoll_events
|
|
structure. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_SYNC_FILE_RANGE
|
|
Issue the equivalent of a \fBsync_file_range\fR(2) on the file descriptor. The
|
|
.I fd
|
|
field is the file descriptor to sync, the
|
|
.I off
|
|
field holds the offset in bytes, the
|
|
.I len
|
|
field holds the length in bytes, and the
|
|
.I sync_range_flags
|
|
field holds the flags for the command. See also
|
|
.BR sync_file_range (2)
|
|
for the general description of the related system call. Available since 5.2.
|
|
|
|
.TP
|
|
.B IORING_OP_SENDMSG
|
|
Issue the equivalent of a
|
|
.BR sendmsg (2)
|
|
system call.
|
|
.I fd
|
|
must be set to the socket file descriptor,
|
|
.I addr
|
|
must contain a pointer to the msghdr structure, and
|
|
.I msg_flags
|
|
holds the flags associated with the system call. See also
|
|
.BR sendmsg (2)
|
|
for the general description of the related system call. Available since 5.3.
|
|
|
|
.TP
|
|
.B IORING_OP_RECVMSG
|
|
Works just like IORING_OP_SENDMSG, except for
|
|
.BR recvmsg (2)
|
|
instead. See the description of IORING_OP_SENDMSG. Available since 5.3.
|
|
|
|
.TP
|
|
.B IORING_OP_SEND
|
|
Issue the equivalent of a
|
|
.BR send (2)
|
|
system call.
|
|
.I fd
|
|
must be set to the socket file descriptor,
|
|
.I addr
|
|
must contain a pointer to the buffer,
|
|
.I len
|
|
denotes the length of the buffer to send, and
|
|
.I msg_flags
|
|
holds the flags associated with the system call. See also
|
|
.BR send (2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_RECV
|
|
Works just like IORING_OP_SEND, except for
|
|
.BR recv (2)
|
|
instead. See the description of IORING_OP_SEND. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_TIMEOUT
|
|
This command will register a timeout operation. The
|
|
.I addr
|
|
field must contain a pointer to a struct timespec64 structure,
|
|
.I len
|
|
must contain 1 to signify one timespec64 structure,
|
|
.I timeout_flags
|
|
may contain IORING_TIMEOUT_ABS
|
|
for an absolute timeout value, or 0 for a relative timeout.
|
|
.I off
|
|
may contain a completion event count. A timeout
|
|
will trigger a wakeup event on the completion ring for anyone waiting for
|
|
events. A timeout condition is met when either the specified timeout expires,
|
|
or the specified number of events have completed. Either condition will
|
|
trigger the event. If set to 0, completed events are not counted, which
|
|
effectively acts like a timer. io_uring timeouts use the
|
|
.B CLOCK_MONOTONIC
|
|
clock source. The request will complete with
|
|
.I -ETIME
|
|
if the timeout got completed through expiration of the timer, or
|
|
.I 0
|
|
if the timeout got completed through requests completing on their own. If
|
|
the timeout was cancelled before it expired, the request will complete with
|
|
.I -ECANCELED.
|
|
Available since 5.4.
|
|
|
|
.TP
|
|
.B IORING_OP_TIMEOUT_REMOVE
|
|
If
|
|
.I timeout_flags
are zero, then it attempts to remove an existing timeout
|
|
operation.
|
|
.I addr
|
|
must contain the
|
|
.I user_data
|
|
field of the previously issued timeout operation. If the specified timeout
|
|
request is found and cancelled successfully, this request will terminate
|
|
with a result value of
|
|
.IR 0 .
|
|
If the timeout request was found but expiration was already in progress,
|
|
this request will terminate with a result value of
|
|
.IR -EBUSY .
|
|
If the timeout request wasn't found, the request will terminate with a result
|
|
value of
|
|
.IR -ENOENT .
|
|
Available since 5.5.
|
|
|
|
If
|
|
.I timeout_flags
|
|
contain
|
|
.I IORING_TIMEOUT_UPDATE,
|
|
instead of removing an existing operation it updates it.
|
|
.I addr
|
|
and return values are same as before.
|
|
.I addr2
|
|
field must contain a pointer to a struct timespec64 structure.
|
|
.I timeout_flags
|
|
may also contain IORING_TIMEOUT_ABS.
|
|
Available since 5.11.
|
|
|
|
.TP
|
|
.B IORING_OP_ACCEPT
|
|
Issue the equivalent of an
|
|
.BR accept4 (2)
|
|
system call.
|
|
.I fd
|
|
must be set to the socket file descriptor,
|
|
.I addr
|
|
must contain the pointer to the sockaddr structure, and
|
|
.I addr2
|
|
must contain a pointer to the socklen_t addrlen field. See also
|
|
.BR accept4 (2)
|
|
for the general description of the related system call. Available since 5.5.
|
|
|
|
.TP
|
|
.B IORING_OP_ASYNC_CANCEL
|
|
Attempt to cancel an already issued request.
|
|
.I addr
|
|
must contain the
|
|
.I user_data
|
|
field of the request that should be cancelled. The cancellation request will
|
|
complete with one of the following results codes. If found, the
|
|
.I res
|
|
field of the cqe will contain 0. If not found,
|
|
.I res
|
|
will contain -ENOENT. If found and cancellation was attempted, the
|
|
.I res
|
|
field will contain -EALREADY. In this case, the request may or may not
|
|
terminate. In general, requests that are interruptible (like socket IO) will
|
|
get cancelled, while disk IO requests cannot be cancelled if already started.
|
|
Available since 5.5.
|
|
|
|
.TP
|
|
.B IORING_OP_LINK_TIMEOUT
|
|
This request must be linked with another request through
|
|
.I IOSQE_IO_LINK
|
|
which is described below. Unlike
|
|
.I IORING_OP_TIMEOUT,
|
|
.I IORING_OP_LINK_TIMEOUT
|
|
acts on the linked request, not the completion queue. The format of the command
|
|
is otherwise like
|
|
.I IORING_OP_TIMEOUT,
|
|
except there's no completion event count as it's tied to a specific request.
|
|
If used, the timeout specified in the command will cancel the linked command,
|
|
unless the linked command completes before the timeout. The timeout will
|
|
complete with
|
|
.I -ETIME
|
|
if the timer expired and cancellation of the linked request was attempted, or
|
|
.I -ECANCELED
|
|
if the timer got cancelled because of completion of the linked request. Like
|
|
.B IORING_OP_TIMEOUT
|
|
the clock source used is
|
|
.BR CLOCK_MONOTONIC .
|
|
Available since 5.5.
|
|
|
|
|
|
.TP
|
|
.B IORING_OP_CONNECT
|
|
Issue the equivalent of a
|
|
.BR connect(2)
|
|
system call.
|
|
.I fd
|
|
must be set to the socket file descriptor,
|
|
.I addr
|
|
must contain the const pointer to the sockaddr structure, and
|
|
.I off
|
|
must contain the socklen_t addrlen field. See also
|
|
.BR connect(2)
|
|
for the general description of the related system call. Available since 5.5.
|
|
|
|
.TP
|
|
.B IORING_OP_FALLOCATE
|
|
Issue the equivalent of a
|
|
.BR fallocate(2)
|
|
system call.
|
|
.I fd
|
|
must be set to the file descriptor,
|
|
.I len
|
|
must contain the mode associated with the operation,
|
|
.I off
|
|
must contain the offset on which to operate, and
|
|
.I addr
|
|
must contain the length. See also
|
|
.BR fallocate(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_FADVISE
|
|
Issue the equivalent of a
|
|
.BR posix_fadvise(2)
|
|
system call.
|
|
.I fd
|
|
must be set to the file descriptor,
|
|
.I off
|
|
must contain the offset on which to operate,
|
|
.I len
|
|
must contain the length, and
|
|
.I fadvise_advice
|
|
must contain the advice associated with the operation. See also
|
|
.BR posix_fadvise(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_MADVISE
|
|
Issue the equivalent of a
|
|
.BR madvise(2)
|
|
system call.
|
|
.I addr
|
|
must contain the address to operate on,
|
|
.I len
|
|
must contain the length on which to operate,
|
|
and
|
|
.I fadvise_advice
|
|
must contain the advice associated with the operation. See also
|
|
.BR madvise(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_OPENAT
|
|
Issue the equivalent of a
|
|
.BR openat(2)
|
|
system call.
|
|
.I fd
|
|
is the
|
|
.I dirfd
|
|
argument,
|
|
.I addr
|
|
must contain a pointer to the
|
|
.I *pathname
|
|
argument,
|
|
.I open_flags
|
|
should contain any flags passed in, and
|
|
.I len
|
|
is access mode of the file. See also
|
|
.BR openat(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_OPENAT2
|
|
Issue the equivalent of a
|
|
.BR openat2(2)
|
|
system call.
|
|
.I fd
|
|
is the
|
|
.I dirfd
|
|
argument,
|
|
.I addr
|
|
must contain a pointer to the
|
|
.I *pathname
|
|
argument,
|
|
.I len
|
|
should contain the size of the open_how structure, and
|
|
.I off
|
|
should be set to the address of the open_how structure. See also
|
|
.BR openat2(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_CLOSE
|
|
Issue the equivalent of a
|
|
.BR close(2)
|
|
system call.
|
|
.I fd
|
|
is the file descriptor to be closed. See also
|
|
.BR close(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_STATX
|
|
Issue the equivalent of a
|
|
.BR statx(2)
|
|
system call.
|
|
.I fd
|
|
is the
|
|
.I dirfd
|
|
argument,
|
|
.I addr
|
|
must contain a pointer to the
|
|
.I *pathname
|
|
string,
|
|
.I statx_flags
|
|
is the
|
|
.I flags
|
|
argument,
|
|
.I len
|
|
should be the
|
|
.I mask
|
|
argument, and
|
|
.I off
|
|
must contain a pointer to the
|
|
.I statxbuf
|
|
to be filled in. See also
|
|
.BR statx(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_READ
|
|
.TP
|
|
.B IORING_OP_WRITE
|
|
Issue the equivalent of a
|
|
.BR pread(2)
|
|
or
|
|
.BR pwrite(2)
|
|
system call.
|
|
.I fd
|
|
is the file descriptor to be operated on,
|
|
.I addr
|
|
contains the buffer in question,
|
|
.I len
|
|
contains the length of the IO operation, and
|
|
.I off
|
|
contains the read or write offset. If
|
|
.I fd
|
|
does not refer to a seekable file,
|
|
.I off
|
|
must be set to zero. If
|
|
.I off
|
|
is set to -1, the offset will use (and advance) the file position, like the
|
|
.BR read(2)
|
|
and
|
|
.BR write(2)
|
|
system calls. These are non-vectored versions of the
|
|
.B IORING_OP_READV
|
|
and
|
|
.B IORING_OP_WRITEV
|
|
opcodes. See also
|
|
.BR read(2)
|
|
and
|
|
.BR write(2)
|
|
for the general description of the related system call. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_SPLICE
|
|
Issue the equivalent of a
|
|
.BR splice(2)
|
|
system call.
|
|
.I splice_fd_in
|
|
is the file descriptor to read from,
|
|
.I splice_off_in
|
|
is an offset to read from,
|
|
.I fd
|
|
is the file descriptor to write to,
|
|
.I off
|
|
is an offset from which to start writing to. A sentinel value of -1 is used
|
|
to pass the equivalent of a NULL for the offsets to
|
|
.BR splice (2).
|
|
.I len
|
|
contains the number of bytes to copy.
|
|
.I splice_flags
|
|
contains a bit mask for the flag field associated with the system call.
|
|
Please note that one of the file descriptors must refer to a pipe.
|
|
See also
|
|
.BR splice(2)
|
|
for the general description of the related system call. Available since 5.7.
|
|
|
|
.TP
|
|
.B IORING_OP_TEE
|
|
Issue the equivalent of a
|
|
.BR tee(2)
|
|
system call.
|
|
.I splice_fd_in
|
|
is the file descriptor to read from,
|
|
.I fd
|
|
is the file descriptor to write to,
|
|
.I len
|
|
contains the number of bytes to copy, and
|
|
.I splice_flags
|
|
contains a bit mask for the flag field associated with the system call.
|
|
Please note that both of the file descriptors must refer to a pipe.
|
|
See also
|
|
.BR tee(2)
|
|
for the general description of the related system call. Available since 5.8.
|
|
|
|
.TP
|
|
.B IORING_OP_FILES_UPDATE
|
|
This command is an alternative to using
|
|
.B IORING_REGISTER_FILES_UPDATE
|
|
which then works in an async fashion, like the rest of the io_uring commands.
|
|
The arguments passed in are the same.
|
|
.I addr
|
|
must contain a pointer to the array of file descriptors,
|
|
.I len
|
|
must contain the length of the array, and
|
|
.I off
|
|
must contain the offset at which to operate. Note that the array of file
|
|
descriptors pointed to in
|
|
.I addr
|
|
must remain valid until this operation has completed. Available since 5.6.
|
|
|
|
.TP
|
|
.B IORING_OP_PROVIDE_BUFFERS
|
|
This command allows an application to register a group of buffers to be used
|
|
by commands that read/receive data. Using buffers in this manner can eliminate
|
|
the need to separate the poll + read, which provides a convenient point in
|
|
time to allocate a buffer for a given request. It's often infeasible to have
|
|
as many buffers available as pending reads or receive. With this feature, the
|
|
application can have its pool of buffers ready in the kernel, and when the
|
|
file or socket is ready to read/receive data, a buffer can be selected for the
|
|
operation.
|
|
.I fd
|
|
must contain the number of buffers to provide,
|
|
.I addr
|
|
must contain the starting address to add buffers from,
|
|
.I len
|
|
must contain the length of each buffer to add from the range,
|
|
.I buf_group
|
|
must contain the group ID of this range of buffers, and
|
|
.I off
|
|
must contain the starting buffer ID of this range of buffers. With that set,
|
|
the kernel adds buffers starting with the memory address in
|
|
.I addr,
|
|
each with a length of
|
|
.I len.
|
|
Hence the application should provide
|
|
.I len * fd
|
|
worth of memory in
|
|
.I addr.
|
|
Buffers are grouped by the group ID, and each buffer within this group will be
|
|
identical in size according to the above arguments. This allows the application
|
|
to provide different groups of buffers, and this is often used to have
|
|
differently sized buffers available depending on what the expectations are of
|
|
the individual request. When submitting a request that should use a provided
|
|
buffer, the
|
|
.B IOSQE_BUFFER_SELECT
|
|
flag must be set, and
|
|
.I buf_group
|
|
must be set to the desired buffer group ID where the buffer should be selected
|
|
from. Available since 5.7.
|
|
|
|
.TP
|
|
.B IORING_OP_REMOVE_BUFFERS
|
|
Remove buffers previously registered with
|
|
.B IORING_OP_PROVIDE_BUFFERS.
|
|
.I fd
|
|
must contain the number of buffers to remove, and
|
|
.I buf_group
|
|
must contain the buffer group ID from which to remove the buffers. Available
|
|
since 5.7.
|
|
|
|
.TP
|
|
.B IORING_OP_SHUTDOWN
|
|
Issue the equivalent of a
|
|
.BR shutdown(2)
|
|
system call.
|
|
.I fd
|
|
is the file descriptor of the socket being shut down; no other fields should
|
|
be set. Available since 5.11.
|
|
|
|
.TP
|
|
.B IORING_OP_RENAMEAT
|
|
Issue the equivalent of a
|
|
.BR renameat2(2)
|
|
system call.
|
|
.I fd
|
|
should be set to the
|
|
.I olddirfd,
|
|
.I addr
|
|
should be set to the
|
|
.I oldpath,
|
|
.I len
|
|
should be set to the
|
|
.I newdirfd,
|
|
.I addr
|
|
should be set to the
|
|
.I oldpath,
|
|
.I addr2
|
|
should be set to the
|
|
.I newpath,
|
|
and finally
|
|
.I rename_flags
|
|
should be set to the
|
|
.I flags
|
|
passed in to
|
|
.BR renameat2 (2).
|
|
Available since 5.11.
|
|
|
|
.TP
|
|
.B IORING_OP_UNLINKAT
|
|
Issue the equivalent of a
|
|
.BR unlinkat (2)
|
|
system call.
|
|
.I fd
|
|
should be set to the
|
|
.I dirfd,
|
|
.I addr
|
|
should be set to the
|
|
.I pathname,
|
|
and
|
|
.I unlink_flags
|
|
should be set to the
|
|
.I flags
|
|
being passed in to
|
|
.BR unlinkat (2).
|
|
Available since 5.11.
|
|
|
|
.PP
|
|
The
|
|
.I flags
|
|
field is a bit mask. The supported flags are:
|
|
.TP
|
|
.B IOSQE_FIXED_FILE
|
|
When this flag is specified,
|
|
.I fd
|
|
is an index into the files array registered with the io_uring instance (see the
|
|
.B IORING_REGISTER_FILES
|
|
section of the
|
|
.BR io_uring_register (2)
|
|
man page). Available since 5.1.
|
|
.TP
|
|
.B IOSQE_IO_DRAIN
|
|
When this flag is specified, the SQE will not be started before previously
|
|
submitted SQEs have completed, and new SQEs will not be started before this
|
|
one completes. Available since 5.2.
|
|
.TP
|
|
.B IOSQE_IO_LINK
|
|
When this flag is specified, it forms a link with the next SQE in the
|
|
submission ring. That next SQE will not be started before this one completes.
|
|
This, in effect, forms a chain of SQEs, which can be arbitrarily long. The tail
|
|
of the chain is denoted by the first SQE that does not have this flag set.
|
|
This flag has no effect on previous SQE submissions, nor does it impact SQEs
|
|
that are outside of the chain tail. This means that multiple chains can be
|
|
executing in parallel, or chains and individual SQEs. Only members inside the
|
|
chain are serialized. A chain of SQEs will be broken, if any request in that
|
|
chain ends in error. io_uring considers any unexpected result an error. This
|
|
means that, eg, a short read will also terminate the remainder of the chain.
|
|
If a chain of SQE links is broken, the remaining unstarted part of the chain
|
|
will be terminated and completed with
|
|
.B -ECANCELED
|
|
as the error code. Available since 5.3.
|
|
.TP
|
|
.B IOSQE_IO_HARDLINK
|
|
Like IOSQE_IO_LINK, but it doesn't sever regardless of the completion result.
|
|
Note that the link will still sever if we fail submitting the parent request,
|
|
hard links are only resilient in the presence of completion results for
|
|
requests that did submit correctly. IOSQE_IO_HARDLINK implies IOSQE_IO_LINK.
|
|
Available since 5.5.
|
|
.TP
|
|
.B IOSQE_ASYNC
|
|
Normal operation for io_uring is to try and issue an sqe as non-blocking first,
|
|
and if that fails, execute it in an async manner. To support more efficient
|
|
overlapped operation of requests that the application knows/assumes will
|
|
always (or most of the time) block, the application can ask for an sqe to be
|
|
issued async from the start. Available since 5.6.
|
|
.TP
|
|
.B IOSQE_BUFFER_SELECT
|
|
Used in conjunction with the
|
|
.B IORING_OP_PROVIDE_BUFFERS
|
|
command, which registers a pool of buffers to be used by commands that read
|
|
or receive data. When buffers are registered for this use case, and this
|
|
flag is set in the command, io_uring will grab a buffer from this pool when
|
|
the request is ready to receive or read data. If successful, the resulting CQE
|
|
will have
|
|
.B IORING_CQE_F_BUFFER
|
|
set in the flags part of the struct, and the upper
|
|
.B IORING_CQE_BUFFER_SHIFT
|
|
bits will contain the ID of the selected buffers. This allows the application
|
|
to know exactly which buffer was selected for the operation. If no buffers
|
|
are available and this flag is set, then the request will fail with
|
|
.B -ENOBUFS
|
|
as the error code. Once a buffer has been used, it is no longer available in
|
|
the kernel pool. The application must re-register the given buffer again when
|
|
it is ready to recycle it (eg has completed using it). Available since 5.7.
|
|
|
|
.PP
|
|
.I ioprio
|
|
specifies the I/O priority. See
|
|
.BR ioprio_get (2)
|
|
for a description of Linux I/O priorities.
|
|
|
|
.I fd
|
|
specifies the file descriptor against which the operation will be
|
|
performed, with the exception noted above.
|
|
|
|
If the operation is one of
|
|
.B IORING_OP_READ_FIXED
|
|
or
|
|
.BR IORING_OP_WRITE_FIXED ,
|
|
.I addr
|
|
and
|
|
.I len
|
|
must fall within the buffer located at
|
|
.I buf_index
|
|
in the fixed buffer array. If the operation is either
|
|
.B IORING_OP_READV
|
|
or
|
|
.BR IORING_OP_WRITEV ,
|
|
then
|
|
.I addr
|
|
points to an iovec array of
|
|
.I len
|
|
entries.
|
|
|
|
.IR rw_flags ,
|
|
specified for read and write operations, contains a bitwise OR of
|
|
per-I/O flags, as described in the
|
|
.BR preadv2 (2)
|
|
man page.
|
|
|
|
The
|
|
.I fsync_flags
|
|
bit mask may contain either 0, for a normal file integrity sync, or
|
|
.B IORING_FSYNC_DATASYNC
|
|
to provide data sync only semantics. See the descriptions of
|
|
.B O_SYNC
|
|
and
|
|
.B O_DSYNC
|
|
in the
|
|
.BR open (2)
|
|
manual page for more information.
|
|
|
|
The bits that may be set in
|
|
.I poll_events
|
|
are defined in \fI<poll.h>\fP, and documented in
|
|
.BR poll (2).
|
|
|
|
.I user_data
|
|
is an application-supplied value that will be copied into
|
|
the completion queue entry (see below).
|
|
.I buf_index
|
|
is an index into an array of fixed buffers, and is only valid if fixed
|
|
buffers were registered.
|
|
.I personality
|
|
is the credentials id to use for this operation. See
|
|
.BR io_uring_register (2)
|
|
for how to register personalities with io_uring. If set to 0, the current
|
|
personality of the submitting task is used.
|
|
.PP
|
|
Once the submission queue entry is initialized, I/O is submitted by
|
|
placing the index of the submission queue entry into the tail of the
|
|
submission queue. After one or more indexes are added to the queue,
|
|
and the queue tail is advanced, the
|
|
.BR io_uring_enter (2)
|
|
system call can be invoked to initiate the I/O.
|
|
|
|
Completions use the following data structure:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
/*
|
|
* IO completion data structure (Completion Queue Entry)
|
|
*/
|
|
struct io_uring_cqe {
|
|
__u64 user_data; /* sqe->data submission passed back */
|
|
__s32 res; /* result code for this event */
|
|
__u32 flags;
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
.I user_data
|
|
is copied from the field of the same name in the submission queue
|
|
entry. The primary use case is to store data that the application
|
|
will need to access upon completion of this particular I/O. The
|
|
.I flags
|
|
is reserved for future use.
|
|
.I res
|
|
is the operation-specific result, but io_uring-specific errors
|
|
(e.g. flags or opcode invalid) are returned through this field.
|
|
They are described in section
|
|
.B CQE ERRORS.
|
|
.PP
|
|
For read and write opcodes, the
|
|
return values match those documented in the
|
|
.BR preadv2 (2)
|
|
and
|
|
.BR pwritev2 (2)
|
|
man pages.
|
|
Return codes for the io_uring-specific opcodes are documented in the
|
|
description of the opcodes above.
|
|
.PP
|
|
.SH RETURN VALUE
|
|
.BR io_uring_enter ()
|
|
returns the number of I/Os successfully consumed. This can be zero
|
|
if
|
|
.I to_submit
|
|
was zero or if the submission queue was empty. Note that if the ring was
|
|
created with
|
|
.B IORING_SETUP_SQPOLL
|
|
specified, then the return value will generally be the same as
|
|
.I to_submit
|
|
as submission happens outside the context of the system call.
|
|
|
|
The errors related to a submission queue entry will be returned through a
|
|
completion queue entry (see section
|
|
.B CQE ERRORS),
|
|
rather than through the system call itself.
|
|
|
|
Errors that occur not on behalf of a submission queue entry are returned via the
|
|
system call directly. On such an error, -1 is returned and
|
|
.I errno
|
|
is set appropriately.
|
|
.PP
|
|
.SH ERRORS
|
|
These are the errors returned by
|
|
.BR io_uring_enter ()
|
|
system call.
|
|
.TP
|
|
.B EAGAIN
|
|
The kernel was unable to allocate memory for the request, or otherwise ran out
|
|
of resources to handle it. The application should wait for some completions and
|
|
try again.
|
|
.TP
|
|
.B EBADF
|
|
.I fd
|
|
is not a valid file descriptor.
|
|
.TP
|
|
.B EBADFD
|
|
.I fd
|
|
is a valid file descriptor, but the io_uring ring is not in the right state
|
|
(enabled). See
|
|
.BR io_uring_register (2)
|
|
for details on how to enable the ring.
|
|
.TP
|
|
.B EBUSY
|
|
The application is attempting to overcommit the number of requests it can have
|
|
pending. The application should wait for some completions and try again. May
|
|
occur if the application tries to queue more requests than we have room for in
|
|
the CQ ring, or if the application attempts to wait for more events without
|
|
having reaped the ones already present in the CQ ring.
|
|
.TP
|
|
.B EINVAL
|
|
Some bits in the
|
|
.I flags
|
|
argument are invalid.
|
|
.TP
|
|
.B EFAULT
|
|
An invalid user space address was specified for the
|
|
.I sig
|
|
argument.
|
|
.TP
|
|
.B ENXIO
|
|
The io_uring instance is in the process of being torn down.
|
|
.TP
|
|
.B EOPNOTSUPP
|
|
.I fd
|
|
does not refer to an io_uring instance.
|
|
.TP
|
|
.B EINTR
|
|
The operation was interrupted by a delivery of a signal before it could
|
|
complete; see
|
|
.BR signal (7).
|
|
Can happen while waiting for events with
|
|
.B IORING_ENTER_GETEVENTS.
|
|
|
|
.SH CQE ERRORS
|
|
These io_uring-specific errors are returned as a negative value in the
|
|
.I res
|
|
field of the completion queue entry.
|
|
.TP
|
|
.B EACCES
|
|
The
|
|
.I flags
|
|
field or
|
|
.I opcode
|
|
in a submission queue entry is not allowed due to registered restrictions.
|
|
See
|
|
.BR io_uring_register (2)
|
|
for details on how restrictions work.
|
|
.TP
|
|
.B EBADF
|
|
The
|
|
.I fd
|
|
field in the submission queue entry is invalid, or the
|
|
.B IOSQE_FIXED_FILE
|
|
flag was set in the submission queue entry, but no files were registered
|
|
with the io_uring instance.
|
|
.TP
|
|
.B EFAULT
|
|
The buffer is outside of the process's accessible address space.
|
|
.TP
|
|
.B EFAULT
|
|
.B IORING_OP_READ_FIXED
|
|
or
|
|
.B IORING_OP_WRITE_FIXED
|
|
was specified in the
|
|
.I opcode
|
|
field of the submission queue entry, but either buffers were not
|
|
registered for this io_uring instance, or the address range described
|
|
by
|
|
.I addr
|
|
and
|
|
.I len
|
|
does not fit within the buffer registered at
|
|
.IR buf_index .
|
|
.TP
|
|
.B EINVAL
|
|
The
|
|
.I flags
|
|
field or
|
|
.I opcode
|
|
in a submission queue entry is invalid.
|
|
.TP
|
|
.B EINVAL
|
|
The
|
|
.I buf_index
|
|
member of the submission queue entry is invalid.
|
|
.TP
|
|
.B EINVAL
|
|
The
|
|
.I personality
|
|
field in a submission queue entry is invalid.
|
|
.TP
|
|
.B EINVAL
|
|
.B IORING_OP_NOP
|
|
was specified in the submission queue entry, but the io_uring context
|
|
was setup for polling
|
|
.RB ( IORING_SETUP_IOPOLL
|
|
was specified in the call to io_uring_setup).
|
|
.TP
|
|
.B EINVAL
|
|
.B IORING_OP_READV
|
|
or
|
|
.B IORING_OP_WRITEV
|
|
was specified in the submission queue entry, but the io_uring instance
|
|
has fixed buffers registered.
|
|
.TP
|
|
.B EINVAL
|
|
.B IORING_OP_READ_FIXED
|
|
or
|
|
.B IORING_OP_WRITE_FIXED
|
|
was specified in the submission queue entry, and the
|
|
.I buf_index
|
|
is invalid.
|
|
.TP
|
|
.B EINVAL
|
|
.BR IORING_OP_READV ,
|
|
.BR IORING_OP_WRITEV ,
|
|
.BR IORING_OP_READ_FIXED ,
|
|
.B IORING_OP_WRITE_FIXED
|
|
or
|
|
.B IORING_OP_FSYNC
|
|
was specified in the submission queue entry, but the io_uring instance
|
|
was configured for IOPOLLing, or any of
|
|
.IR addr ,
|
|
.IR ioprio ,
|
|
.IR off ,
|
|
.IR len ,
|
|
or
|
|
.I buf_index
|
|
was set in the submission queue entry.
|
|
.TP
|
|
.B EINVAL
|
|
.B IORING_OP_POLL_ADD
|
|
or
|
|
.B IORING_OP_POLL_REMOVE
|
|
was specified in the
|
|
.I opcode
|
|
field of the submission queue entry, but the io_uring instance was
|
|
configured for busy-wait polling
|
|
.RB ( IORING_SETUP_IOPOLL ),
|
|
or any of
|
|
.IR ioprio ,
|
|
.IR off ,
|
|
.IR len ,
|
|
or
|
|
.I buf_index
|
|
was non-zero in the submission queue entry.
|
|
.TP
|
|
.B EINVAL
|
|
.B IORING_OP_POLL_ADD
|
|
was specified in the
|
|
.I opcode
|
|
field of the submission queue entry, and the
|
|
.I addr
|
|
field was non-zero.
|
|
.TP
|
|
.B EOPNOTSUPP
|
|
.I opcode
|
|
is valid, but not supported by this kernel.
|
|
.TP
|
|
.B EOPNOTSUPP
|
|
.B IOSQE_BUFFER_SELECT
|
|
was set in the
|
|
.I flags
|
|
field of the submission queue entry, but the
|
|
.I opcode
|
|
doesn't support buffer selection.
|