472 lines
13 KiB
Groff
472 lines
13 KiB
Groff
.\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
|
|
.\" Copyright (C) 2019 Jon Corbet <corbet@lwn.net>
|
|
.\" Copyright (C) 2019 Red Hat, Inc.
|
|
.\"
|
|
.\" SPDX-License-Identifier: LGPL-2.0-or-later
|
|
.\"
|
|
.TH IO_URING_SETUP 2 2019-01-29 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
io_uring_setup \- set up a context for performing asynchronous I/O
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.BR "#include <linux/io_uring.h>"
|
|
.PP
|
|
.BI "int io_uring_setup(u32 " entries ", struct io_uring_params *" p );
|
|
.fi
|
|
.PP
|
|
.SH DESCRIPTION
|
|
.PP
|
|
The io_uring_setup() system call sets up a submission queue (SQ) and
|
|
completion queue (CQ) with at least
|
|
.I entries
|
|
entries, and returns a file descriptor which can be used to perform
|
|
subsequent operations on the io_uring instance. The submission and
|
|
completion queues are shared between userspace and the kernel, which
|
|
eliminates the need to copy data when initiating and completing I/O.
|
|
|
|
.I params
|
|
is used by the application to pass options to the kernel, and by the
|
|
kernel to convey information about the ring buffers.
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_uring_params {
|
|
__u32 sq_entries;
|
|
__u32 cq_entries;
|
|
__u32 flags;
|
|
__u32 sq_thread_cpu;
|
|
__u32 sq_thread_idle;
|
|
__u32 features;
|
|
__u32 wq_fd;
__u32 resv[3];
|
|
struct io_sqring_offsets sq_off;
|
|
struct io_cqring_offsets cq_off;
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The
|
|
.IR flags ,
|
|
.IR sq_thread_cpu ,
|
|
and
|
|
.I sq_thread_idle
|
|
fields are used to configure the io_uring instance.
|
|
.I flags
|
|
is a bit mask of 0 or more of the following values ORed
|
|
together:
|
|
.TP
|
|
.B IORING_SETUP_IOPOLL
|
|
Perform busy-waiting for an I/O completion, as opposed to getting
|
|
notifications via an asynchronous IRQ (Interrupt Request). The file
|
|
system (if any) and block device must support polling in order for
|
|
this to work. Busy-waiting provides lower latency, but may consume
|
|
more CPU resources than interrupt driven I/O. Currently, this feature
|
|
is usable only on a file descriptor opened using the
|
|
.B O_DIRECT
|
|
flag. When a read or write is submitted to a polled context, the
|
|
application must poll for completions on the CQ ring by calling
|
|
.BR io_uring_enter (2).
|
|
It is illegal to mix and match polled and non-polled I/O on an io_uring
|
|
instance.
|
|
|
|
.TP
|
|
.B IORING_SETUP_SQPOLL
|
|
When this flag is specified, a kernel thread is created to perform
|
|
submission queue polling. An io_uring instance configured in this way
|
|
enables an application to issue I/O without ever context switching
|
|
into the kernel. By using the submission queue to fill in new
|
|
submission queue entries and watching for completions on the
|
|
completion queue, the application can submit and reap I/Os without
|
|
doing a single system call.
|
|
|
|
If the kernel thread is idle for more than
|
|
.I sq_thread_idle
|
|
milliseconds, it will set the
|
|
.B IORING_SQ_NEED_WAKEUP
|
|
bit in the
|
|
.I flags
|
|
field of the
|
|
.IR "struct io_sq_ring" .
|
|
When this happens, the application must call
|
|
.BR io_uring_enter (2)
|
|
to wake the kernel thread. If I/O is kept busy, the kernel thread
|
|
will never sleep. An application making use of this feature will need
|
|
to guard the
|
|
.BR io_uring_enter (2)
|
|
call with the following code sequence:
|
|
|
|
.in +4n
|
|
.EX
|
|
/*
|
|
* Ensure that the wakeup flag is read after the tail pointer
|
|
* has been written. It's important to use memory load acquire
|
|
* semantics for the flags read, as otherwise the application
|
|
* and the kernel might not agree on the consistency of the
|
|
* wakeup flag.
|
|
*/
|
|
unsigned flags = atomic_load_relaxed(sq_ring->flags);
|
|
if (flags & IORING_SQ_NEED_WAKEUP)
|
|
io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
|
|
.EE
|
|
.in
|
|
|
|
where
|
|
.I sq_ring
|
|
is a submission queue ring set up using the
|
|
.I struct io_sqring_offsets
|
|
described below.
|
|
.TP
|
|
.BR
|
|
Before version 5.11 of the Linux kernel, to successfully use this feature, the
|
|
application must register a set of files to be used for IO through
|
|
.BR io_uring_register (2)
|
|
using the
|
|
.B IORING_REGISTER_FILES
|
|
opcode. Failure to do so will result in submitted IO being errored with
|
|
.BR EBADF .
|
|
The presence of this feature can be detected by the
|
|
.B IORING_FEAT_SQPOLL_NONFIXED
|
|
feature flag.
|
|
In version 5.11 and later, it is no longer necessary to register files to use
|
|
this feature. 5.11 also allows using this as non-root, if the user has the
|
|
.B CAP_SYS_NICE
|
|
capability.
|
|
.TP
|
|
.B IORING_SETUP_SQ_AFF
|
|
If this flag is specified, then the poll thread will be bound to the
|
|
cpu set in the
|
|
.I sq_thread_cpu
|
|
field of the
|
|
.IR "struct io_uring_params" .
|
|
This flag is only meaningful when
|
|
.B IORING_SETUP_SQPOLL
|
|
is specified. When cgroup setting
|
|
.I cpuset.cpus
|
|
changes (typically in a container environment), the bound CPU set may be
|
|
changed as well.
|
|
.TP
|
|
.B IORING_SETUP_CQSIZE
|
|
Create the completion queue with
|
|
.IR "struct io_uring_params.cq_entries"
|
|
entries. The value must be greater than
|
|
.IR entries ,
|
|
and may be rounded up to the next power-of-two.
|
|
.TP
|
|
.B IORING_SETUP_CLAMP
|
|
If this flag is specified, and if
|
|
.IR entries
|
|
exceeds
|
|
.BR IORING_MAX_ENTRIES ,
|
|
then
|
|
.IR entries
|
|
will be clamped at
|
|
.BR IORING_MAX_ENTRIES .
|
|
If the flag
|
|
.B IORING_SETUP_CQSIZE
|
|
is set, and if the value of
|
|
.IR "struct io_uring_params.cq_entries"
|
|
exceeds
|
|
.BR IORING_MAX_CQ_ENTRIES ,
|
|
then it will be clamped at
|
|
.BR IORING_MAX_CQ_ENTRIES .
|
|
.TP
|
|
.B IORING_SETUP_ATTACH_WQ
|
|
This flag should be set in conjunction with
|
|
.IR "struct io_uring_params.wq_fd"
|
|
being set to an existing io_uring ring file descriptor. When set, the
|
|
io_uring instance being created will share the asynchronous worker
|
|
thread backend of the specified io_uring ring, rather than create a new
|
|
separate thread pool.
|
|
.TP
|
|
.B IORING_SETUP_R_DISABLED
|
|
If this flag is specified, the io_uring ring starts in a disabled state.
|
|
In this state, restrictions can be registered, but submissions are not allowed.
|
|
See
|
|
.BR io_uring_register (2)
|
|
for details on how to enable the ring. Available since 5.10.
|
|
.PP
|
|
If no flags are specified, the io_uring instance is set up for
|
|
interrupt driven I/O. I/O may be submitted using
|
|
.BR io_uring_enter (2)
|
|
and can be reaped by polling the completion queue.
|
|
|
|
The
|
|
.I resv
|
|
array must be initialized to zero.
|
|
|
|
.I features
|
|
is filled in by the kernel, which specifies various features supported
|
|
by the current kernel version.
|
|
.TP
|
|
.B IORING_FEAT_SINGLE_MMAP
|
|
If this flag is set, the two SQ and CQ rings can be mapped with a single
|
|
.BR mmap (2)
|
|
call. The SQEs must still be allocated separately. This brings the necessary
|
|
.BR mmap (2)
|
|
calls down from three to two.
|
|
.TP
|
|
.B IORING_FEAT_NODROP
|
|
If this flag is set, io_uring supports never dropping completion events.
|
|
If a completion event occurs and the CQ ring is full, the kernel stores
|
|
the event internally until such a time that the CQ ring has room for more
|
|
entries. If this overflow condition is entered, attempting to submit more
|
|
IO will fail with the
|
|
.B -EBUSY
|
|
error value, if it can't flush the overflown events to the CQ ring. If this
|
|
happens, the application must reap events from the CQ ring and attempt the
|
|
submit again.
|
|
.TP
|
|
.B IORING_FEAT_SUBMIT_STABLE
|
|
If this flag is set, applications can be certain that any data for
|
|
async offload has been consumed when the kernel has consumed the SQE.
|
|
.TP
|
|
.B IORING_FEAT_RW_CUR_POS
|
|
If this flag is set, applications can specify
|
|
.I offset
|
|
== -1 with
|
|
.BR IORING_OP_{READV,WRITEV} ,
|
|
.BR IORING_OP_{READ,WRITE}_FIXED ,
and
|
|
.B IORING_OP_{READ,WRITE}
|
|
to mean current file position, which behaves like
|
|
.BR preadv2 (2)
|
|
and
|
|
.BR pwritev2 (2)
|
|
with
|
|
.I offset
|
|
== -1. It'll use (and update) the current file position. This obviously comes
|
|
with the caveat that if the application has multiple reads or writes in flight,
|
|
then the end result will not be as expected. This is similar to threads sharing
|
|
a file descriptor and doing IO using the current file position.
|
|
.TP
|
|
.B IORING_FEAT_CUR_PERSONALITY
|
|
If this flag is set, then io_uring guarantees that both sync and async
|
|
execution of a request assumes the credentials of the task that called
|
|
.BR io_uring_enter (2)
|
|
to queue the requests. If this flag isn't set, then requests are issued with
|
|
the credentials of the task that originally registered the io_uring. If only
|
|
one task is using a ring, then this flag doesn't matter as the credentials
|
|
will always be the same. Note that this is the default behavior; tasks can
|
|
still register different personalities through
|
|
.BR io_uring_register (2)
|
|
with
|
|
.B IORING_REGISTER_PERSONALITY
|
|
and specify the personality to use in the sqe.
|
|
.TP
|
|
.B IORING_FEAT_FAST_POLL
|
|
If this flag is set, then io_uring supports using an internal poll mechanism
|
|
to drive data/space readiness. This means that requests that cannot read or
|
|
write data to a file no longer need to be punted to an async thread for
|
|
handling, instead they will begin operation when the file is ready. This is
|
|
similar to doing poll + read/write in userspace, but eliminates the need to do
|
|
so. If this flag is set, requests waiting on space/data consume a lot less
|
|
resources doing so as they are not blocking a thread.
|
|
.TP
|
|
.B IORING_FEAT_POLL_32BITS
|
|
If this flag is set, the
|
|
.B IORING_OP_POLL_ADD
|
|
command accepts the full 32-bit range of epoll based flags. Most notably
|
|
.B EPOLLEXCLUSIVE
|
|
which allows exclusive (waking single waiters) behavior.
|
|
.TP
|
|
.B IORING_FEAT_SQPOLL_NONFIXED
|
|
If this flag is set, the
|
|
.B IORING_SETUP_SQPOLL
|
|
feature no longer requires the use of fixed files. Any normal file descriptor
|
|
can be used for IO commands without needing registration.
|
|
|
|
.PP
|
|
The rest of the fields in the
|
|
.I struct io_uring_params
|
|
are filled in by the kernel, and provide the information necessary to
|
|
memory map the submission queue, completion queue, and the array of
|
|
submission queue entries.
|
|
.I sq_entries
|
|
specifies the number of submission queue entries allocated.
|
|
.I sq_off
|
|
describes the offsets of various ring buffer fields:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_sqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 flags;
|
|
__u32 dropped;
|
|
__u32 array;
|
|
__u32 resv[3];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
Taken together,
|
|
.I sq_entries
|
|
and
|
|
.I sq_off
|
|
provide all of the information necessary for accessing the submission
|
|
queue ring buffer and the submission queue entry array. The
|
|
submission queue can be mapped with a call like:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, sq_off.array + sq_entries * sizeof(__u32),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
where
|
|
.I sq_off
|
|
is the
|
|
.I io_sqring_offsets
|
|
structure, and
|
|
.I ring_fd
|
|
is the file descriptor returned from
|
|
.BR io_uring_setup (2).
|
|
The addition of
|
|
.I sq_off.array
|
|
to the length of the region accounts for the fact that the ring is
|
|
located at the end of the data structure. As an example, the ring
|
|
buffer head pointer can be accessed by adding
|
|
.I sq_off.head
|
|
to the address returned from
|
|
.BR mmap (2):
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
head = ptr + sq_off.head;
|
|
.EE
|
|
.in
|
|
|
|
The
|
|
.I flags
|
|
field is used by the kernel to communicate state information to the
|
|
application. Currently, it is used to inform the application when a
|
|
call to
|
|
.BR io_uring_enter (2)
|
|
is necessary. See the documentation for the
|
|
.B IORING_SETUP_SQPOLL
|
|
flag above.
|
|
The
|
|
.I dropped
|
|
member is incremented for each invalid submission queue entry
|
|
encountered in the ring buffer.
|
|
|
|
The head and tail track the ring buffer state. The tail is
|
|
incremented by the application when submitting new I/O, and the head
|
|
is incremented by the kernel when the I/O has been successfully
|
|
submitted. Determining the index of the head or tail into the ring is
|
|
accomplished by applying a mask:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
index = tail & ring_mask;
|
|
.EE
|
|
.in
|
|
.PP
|
|
The array of submission queue entries is mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
sqentries = mmap(0, sq_entries * sizeof(struct io_uring_sqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQES);
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is described by
|
|
.I cq_entries
|
|
and
|
|
.I cq_off
|
|
shown here:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_cqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 overflow;
|
|
__u32 cqes;
|
|
__u32 flags;
|
|
__u32 resv[3];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is simpler, since the entries are not separated
|
|
from the queue itself, and can be mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd,
|
|
IORING_OFF_CQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
Closing the file descriptor returned by
|
|
.BR io_uring_setup (2)
|
|
will free all resources associated with the io_uring context.
|
|
.PP
|
|
.SH RETURN VALUE
|
|
.BR io_uring_setup (2)
|
|
returns a new file descriptor on success. The application may then
|
|
provide the file descriptor in a subsequent
|
|
.BR mmap (2)
|
|
call to map the submission and completion queues, or to the
|
|
.BR io_uring_register (2)
|
|
or
|
|
.BR io_uring_enter (2)
|
|
system calls.
|
|
|
|
On error, -1 is returned and
|
|
.I errno
|
|
is set appropriately.
|
|
.PP
|
|
.SH ERRORS
|
|
.TP
|
|
.B EFAULT
|
|
params is outside your accessible address space.
|
|
.TP
|
|
.B EINVAL
|
|
The resv array contains non-zero data, p.flags contains an unsupported
|
|
flag,
|
|
.I entries
|
|
is out of bounds,
|
|
.B IORING_SETUP_SQ_AFF
|
|
was specified, but
|
|
.B IORING_SETUP_SQPOLL
|
|
was not, or
|
|
.B IORING_SETUP_CQSIZE
|
|
was specified, but
|
|
.I io_uring_params.cq_entries
|
|
was invalid.
|
|
.TP
|
|
.B EMFILE
|
|
The per-process limit on the number of open file descriptors has been
|
|
reached (see the description of
|
|
.B RLIMIT_NOFILE
|
|
in
|
|
.BR getrlimit (2)).
|
|
.TP
|
|
.B ENFILE
|
|
The system-wide limit on the total number of open files has been
|
|
reached.
|
|
.TP
|
|
.B ENOMEM
|
|
Insufficient kernel resources are available.
|
|
.TP
|
|
.B EPERM
|
|
.B IORING_SETUP_SQPOLL
|
|
was specified, but the effective user ID of the caller did not have sufficient
|
|
privileges.
|
|
.SH SEE ALSO
|
|
.BR io_uring_register (2),
|
|
.BR io_uring_enter (2)
|