This is an almost 100% reproducible crash on MariaDB server 10.6 as of 7e1ec1550ceff29a983bf799622d97b73b79ce43, compiled with -DWITH_URING=yes.
I run the sysbench-tpcc (https://github.com/Percona-Lab/sysbench-tpcc) prepare phase on a 40-core machine as
{code:sh}
./tpcc.lua --mysql-host=yang04g --mysql-user=sbtest --mysql-password=sbtest --mysql-db=sbtest --time=1200 --threads=56 --report-interval=1 --tables=10 --scale=100 --use_fk=0 --mysql_table_options='DEFAULT CHARSET=utf8mb4' prepare
{code}
against a similar 40-core machine running the MariaDB server. After several minutes of the workload the server crashes.
Backtrace:
{noformat:title=10.6 7e1ec1550ceff29a983bf799622d97b73b79ce43}
#0 0x00007f2e109d8aa1 in pthread_kill () from /lib64/libpthread.so.0
#1 0x000055af0b0902c7 in my_write_core (sig=<optimized out>) at /root/krizhanovsky/server/mysys/stacktrace.c:424
#2 0x000055af0abd3610 in handle_fatal_signal (sig=6) at /root/krizhanovsky/server/sql/signal_handler.cc:343
#3 <signal handler called>
#4 0x00007f2e10634387 in raise () from /lib64/libc.so.6
#5 0x00007f2e10635a78 in abort () from /lib64/libc.so.6
#6 0x000055af0a8da889 in ut_dbg_assertion_failed (expr=expr@entry=0x55af0b2a86a7 "cb->m_err == DB_SUCCESS",
file=file@entry=0x55af0b2a8a10 "/root/krizhanovsky/server/storage/innobase/os/os0file.cc", line=line@entry=3843)
at /root/krizhanovsky/server/storage/innobase/ut/ut0dbg.cc:60
#7 0x000055af0a8c3fe0 in io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3843
#8 io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3841
#9 0x000055af0b035668 in tpool::task_group::execute (this=0x55af0c8f27d0, t=0x55af0c917c78) at /root/krizhanovsky/server/tpool/task_group.cc:55
#10 0x000055af0b0345af in tpool::thread_pool_generic::worker_main (this=0x55af0c816320, thread_var=0x55af0c823fc0) at /root/krizhanovsky/server/tpool/tpool_generic.cc:550
#11 0x000055af0b0f7cff in execute_native_thread_routine ()
#12 0x00007f2e109d3ea5 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f2e106fc8dd in clone () from /lib64/libc.so.6
{noformat}
The following patch
{code:diff}
--- a/tpool/aio_liburing.cc
+++ b/tpool/aio_liburing.cc
@@ -152,6 +152,9 @@ class aio_uring final : public tpool::aio
      if (res < 0)
      {
        iocb->m_err= -res;
+       my_printf_error(ER_UNKNOWN_ERROR,
+                       "io_uring_cqe_get_data() returned %d\n",
+                       ME_ERROR_LOG | ME_FATAL, res);
        iocb->m_ret_len= 0;
      }
      else
{code}
produces the line
{noformat}
2021-05-23 11:07:09 0 [ERROR] mariadbd: io_uring_cqe_get_data() returned -11
{noformat}
in the error log.
Alexander Krizhanovsky
added a comment - I use kernel 5.4.105-1.el7.elrepo.x86_64 from the CentOS 7 packages. The distro is outdated, so I built liburing myself from the git sources (master branch, last commit 8bfdf56caaca4dc58e230f07d33f875b7d0648c8). I built the MariaDB server from source using branch 10.6 as of commit 7e1ec1550ceff29a983bf799622d97b73b79ce43. Build options:
{code:sh}
cmake3 -DWITHOUT_TOKUDB_STORAGE_ENGINE=1 -DWITHOUT_ROCKSDB_STORAGE_ENGINE=1 -DWITHOUT_XTRADB_STORAGE_ENGINE=1 -DWITHOUT_ROCKSDB_SE_STORAGE_ENGINE=1 -DWITH_COLUMNSTORE_STORAGE_ENGINE=0 -DWITH_WSREP=OFF -DWITHOUT_PLUGIN_AUTH_PAM=1 -DPLUGIN_AUTH_PAM=OFF -DWITH_URING=yes -DCMAKE_INSTALL_PREFIX=/opt/mariadb-10.6-dbg ..
{code}
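For reference, a liburing build from those git sources would look roughly like the sketch below; the repository URL and the commit hash are the ones mentioned above, while the exact commands are an assumption rather than a transcript of the actual environment.
{code:sh}
# Assumed liburing source-build steps; adjust the install prefix for your system
git clone https://github.com/axboe/liburing.git
cd liburing
git checkout 8bfdf56caaca4dc58e230f07d33f875b7d0648c8
./configure
make
sudo make install
{code}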
Marko Mäkelä
added a comment - krizhanovsky, does this only happen for reads, or also for page writes? Our only asynchronous reads are for read-ahead. I see that linear read-ahead is enabled by default. Please check whether the following settings prevent the crashes:
{noformat}
innodb_random_read_ahead=OFF
innodb_read_ahead_threshold=0
{noformat}
Page writes should always be asynchronous, ever since MDEV-24626 removed the synchronous write of a dummy first page when creating .ibd files.
If disabling the read-ahead cures this, I would like to declare that this is the way to work around the kernel bug. Maybe we should even consider disabling linear read-ahead on SSD, just like in MDEV-17380 we disabled neighbour flushing.
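Both variables are dynamic, so as a rough sketch they can be tried at runtime without a restart; the host and credentials below are simply the ones from the sysbench command in the description and may need adjusting.
{code:sh}
# Sketch: apply the suggested read-ahead settings to the running server
mariadb -h yang04g -u sbtest -psbtest -e "
  SET GLOBAL innodb_random_read_ahead = OFF;
  SET GLOBAL innodb_read_ahead_threshold = 0;"
{code}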
Alexander Krizhanovsky
added a comment - I did some instrumentation and found that only write operations lead to the failure. Switching off read-ahead doesn't affect reproducibility of the issue. However, I overstated that the issue is 100% reproducible; sometimes I need several runs to hit the assertion. I see cb->m_len = 2097152 just before the assertion in io_callback().
Unfortunately, it is a challenge to make BPF work in this outdated environment to trace the kernel; I spent some time today trying to get bpftrace running, but without success. If tracing would help in debugging the problem, I'll try to run it in a chroot'ed distro environment, e.g. one built with debootstrap.
Marko Mäkelä
added a comment - 2097152 bytes is 128 × innodb_page_size (16 KiB by default), that is, 2 MiB. Such writes should only be initiated by buf_dblwr_t::flush_buffered_writes() and collected by buf_dblwr_t::flush_buffered_writes_completed(). Before MDEV-23855, we used 64-page synchronous writes for the doublewrite buffer. All other asynchronous writes should be of innodb_page_size bytes.
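As a quick sanity check of that arithmetic (128 pages of the default 16 KiB page size):
{code:sh}
# 128 * innodb_page_size (16384 bytes by default) = 2097152 bytes = 2 MiB
echo $((128 * 16384))
{code}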
Vladislav Vaintroub
added a comment (edited) - A relatively easy workaround for Linux bugs could be to retry with synchronous I/O instead, once EAGAIN has been collected.
Like below (note that I did not test or even compile this, as I do not have a uring environment):
{code:diff}
diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc
index bdc3601ae35..b5a5279b99a 100644
--- a/tpool/aio_liburing.cc
+++ b/tpool/aio_liburing.cc
@@ -162,6 +162,16 @@ class aio_uring final : public tpool::aio
      io_uring_cqe_seen(&aio->uring_, cqe);
+     if (iocb->m_err == EAGAIN)
+     {
+       if (iocb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+         iocb->m_ret_len= pread(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+       else
+         iocb->m_ret_len= pwrite(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+
+       iocb->m_err= iocb->m_ret_len < 0 ? errno : 0;
+     }
+
      iocb->m_internal_task.m_func= iocb->m_callback;
      iocb->m_internal_task.m_arg= iocb;
      iocb->m_internal_task.m_group= iocb->m_group;
{code}
krizhanovsky, maybe you can check if that patch helps?
Alexander Krizhanovsky
added a comment - I tested Vlad's patch and it works.
This is neither a kernel nor a liburing bug. I had a look at Linux 5.4.105 (the one I use in the tests) and the latest 5.4.125, as well as the current Linus tree; in all these kernels the corresponding code is the same. While liburing has changed the code involved in the problem a little, the main logic remains the same.
The problem is caused by __io_uring_peek_cqe(), called from io_uring_wait_cqe():
{code:c}
do {
    unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
    unsigned head = *ring->cq.khead;

    cqe = NULL;
    available = tail - head;
    if (!available)
        break;

    cqe = &ring->cq.cqes[head & mask];
    if (!(ring->features & IORING_FEAT_EXT_ARG) &&
        cqe->user_data == LIBURING_UDATA_TIMEOUT) {
        if (cqe->res < 0)
            err = cqe->res;
        io_uring_cq_advance(ring, 1);
        if (!err)
            continue;
        cqe = NULL;
    }

    break;
} while (1);
{code}
We first read the ring buffer queue head, then we read cqe->res and process an error if it is non-zero. In our case the function returns 0, so in this code we observe cqe->res = 0, but later, when we check it in io_uring_cqe_get_data(), it is -EAGAIN.
The kernel function io_cqring_fill_event() first moves the tail pointer and only then writes cqe->res:
{code:c}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
    struct io_rings *rings = ctx->rings;
    unsigned tail;

    tail = ctx->cached_cq_tail;
    /*
     * writes to the cq entry need to come after reading head; the
     * control dependency is enough as we're using WRITE_ONCE to
     * fill the cq entry
     */
    if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
        return NULL;

    ctx->cached_cq_tail++;
    return &rings->cqes[tail & ctx->cq_mask];
}

static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
                                 long res)
{
    struct io_uring_cqe *cqe;

    cqe = io_get_cqring(ctx);
    if (cqe) {
        WRITE_ONCE(cqe->res, res);
        .....
{code}
It's easy to see that this place is racy: in liburing we can read the old value of cqe->res and only after that read the new value. Actually, this isn't a bug, and liburing/examples/io_uring-cp.c handles it as follows:
{code:c}
static void queue_prepped(struct io_uring *ring, struct io_data *data)
{
    struct io_uring_sqe *sqe;

    sqe = io_uring_get_sqe(ring);
    assert(sqe);

    if (data->read)
        io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset);
    else
        io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset);

    io_uring_sqe_set_data(sqe, data);
}

static int copy_file(struct io_uring *ring, off_t insize)
{
    ....
            data = io_uring_cqe_get_data(cqe);
            if (cqe->res < 0) {
                if (cqe->res == -EAGAIN) {
                    queue_prepped(ring, data);
                    io_uring_cqe_seen(ring, cqe);
                    continue;
                }
{code}
So I think the right patch should look like this (the servers are still down after the recent outage, so I didn't test the patch yet):
{code:diff}
diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc
index bdc3601ae35..ba1138b2323 100644
--- a/tpool/aio_liburing.cc
+++ b/tpool/aio_liburing.cc
@@ -96,14 +96,7 @@ class aio_uring final : public tpool::aio
     // must be atomical. This is because liburing provides thread-unsafe calls.
     std::lock_guard<std::mutex> _(mutex_);
-   io_uring_sqe *sqe= io_uring_get_sqe(&uring_);
-   if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
-     io_uring_prep_readv(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
-                         cb->m_offset);
-   else
-     io_uring_prep_writev(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
-                          cb->m_offset);
-   io_uring_sqe_set_data(sqe, cb);
+   queue_prepped(cb);
    return io_uring_submit(&uring_) == 1 ? 0 : -1;
  }
@@ -129,6 +122,18 @@ class aio_uring final : public tpool::aio
  }
private:
+ void queue_prepped(tpool::aiocb *cb)
+ {
+   io_uring_sqe *sqe= io_uring_get_sqe(&uring_);
+   if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+     io_uring_prep_readv(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
+                         cb->m_offset);
+   else
+     io_uring_prep_writev(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
+                          cb->m_offset);
+   io_uring_sqe_set_data(sqe, cb);
+ }
+
  static void thread_routine(aio_uring *aio)
  {
    for (;;)
@@ -151,6 +156,11 @@ class aio_uring final : public tpool::aio
      int res= cqe->res;
      if (res < 0)
      {
+       if (cqe->res == -EAGAIN) {
+         aio->queue_prepped(iocb);
+         io_uring_cqe_seen(&aio->uring_, cqe);
+         continue;
+       }
        iocb->m_err= -res;
        iocb->m_ret_len= 0;
      }
{code}
This is very close to what Vlad proposed, but done in the io_uring way.
Marko Mäkelä
added a comment - krizhanovsky, thank you for your analysis. I have not looked at liburing or the kernel code, but I fully trust your expertise. Please file a pull request, and I will merge it as soon as possible.
Alexander Krizhanovsky
added a comment (edited) - The patch handles the very rare case of the race when the -EAGAIN error code is written to cqe->res between checking the value in __io_uring_peek_cqe() and reading it again via io_uring_cqe_get_data(). In normal cases io_uring_wait_cqe() returns an error code and aio_uring::thread_routine() aborts.
If there are too many I/O requests and there is no space in the io_uring ring buffers, then the patch leads to the same error path as any other error, i.e. aborting the server.
If the underlying filesystem/block layers constantly return -EAGAIN and we constantly hit the race condition, then the code will keep spinning until the race no longer occurs and io_uring_wait_cqe() returns with -EAGAIN.
Error code 11 ought to be EAGAIN.