This is an almost 100% reproducible crash on MariaDB server 10.6 as of 7e1ec1550ceff29a983bf799622d97b73b79ce43 compiled with -DWITH_URING=yes.
I run sysbench-tpcc (https://github.com/Percona-Lab/sysbench-tpcc) prepare on a 40-core machine as
./tpcc.lua --mysql-host=yang04g --mysql-user=sbtest --mysql-password=sbtest --mysql-db=sbtest --time=1200 --threads=56 --report-interval=1 --tables=10 --scale=100 --use_fk=0 --mysql_table_options='DEFAULT CHARSET=utf8mb4' prepare
|
against a similar 40-core machine running the MariaDB server. After several minutes of the workload the server crashes.
Backtrace:
10.6 7e1ec1550ceff29a983bf799622d97b73b79ce43
|
#0 0x00007f2e109d8aa1 in pthread_kill () from /lib64/libpthread.so.0
|
#1 0x000055af0b0902c7 in my_write_core (sig=<optimized out>) at /root/krizhanovsky/server/mysys/stacktrace.c:424
|
#2 0x000055af0abd3610 in handle_fatal_signal (sig=6) at /root/krizhanovsky/server/sql/signal_handler.cc:343
|
#3 <signal handler called>
|
#4 0x00007f2e10634387 in raise () from /lib64/libc.so.6
|
#5 0x00007f2e10635a78 in abort () from /lib64/libc.so.6
|
#6 0x000055af0a8da889 in ut_dbg_assertion_failed (expr=expr@entry=0x55af0b2a86a7 "cb->m_err == DB_SUCCESS",
|
file=file@entry=0x55af0b2a8a10 "/root/krizhanovsky/server/storage/innobase/os/os0file.cc", line=line@entry=3843)
|
at /root/krizhanovsky/server/storage/innobase/ut/ut0dbg.cc:60
|
#7 0x000055af0a8c3fe0 in io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3843
|
#8 io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3841
|
#9 0x000055af0b035668 in tpool::task_group::execute (this=0x55af0c8f27d0, t=0x55af0c917c78) at /root/krizhanovsky/server/tpool/task_group.cc:55
|
#10 0x000055af0b0345af in tpool::thread_pool_generic::worker_main (this=0x55af0c816320, thread_var=0x55af0c823fc0) at /root/krizhanovsky/server/tpool/tpool_generic.cc:550
|
#11 0x000055af0b0f7cff in execute_native_thread_routine ()
|
#12 0x00007f2e109d3ea5 in start_thread () from /lib64/libpthread.so.0
|
#13 0x00007f2e106fc8dd in clone () from /lib64/libc.so.6
|
The following patch
--- a/tpool/aio_liburing.cc
|
+++ b/tpool/aio_liburing.cc
|
@@ -152,6 +152,9 @@ class aio_uring final : public tpool::aio
|
if (res < 0)
|
{
|
iocb->m_err= -res;
|
+ my_printf_error(ER_UNKNOWN_ERROR,
|
+ "io_uring_cqe_get_data() returned %d\n",
|
+ ME_ERROR_LOG | ME_FATAL, res);
|
iocb->m_ret_len= 0;
|
}
|
else
|
produces the line
2021-05-23 11:07:09 0 [ERROR] mariadbd: io_uring_cqe_get_data() returned -11
|
in the error log.
- is caused by
-
MDEV-24883
add io_uring support for tpool
-
-
Closed
{"report":{"fcp":1651.4000000953674,"ttfb":601.3000001907349,"pageVisibility":"visible","entityId":99881,"key":"jira.project.issue.view-issue","isInitial":true,"threshold":1000,"elementTimings":{},"userDeviceMemory":8,"userDeviceProcessors":64,"apdex":0.5,"journeyId":"29d1b610-90ae-422e-bdcf-40e5f298e1b3","navigationType":0,"readyForUser":1739.6000001430511,"redirectCount":0,"resourceLoadedEnd":1938.6000001430511,"resourceLoadedStart":606.6000001430511,"resourceTiming":[{"duration":459.2999999523163,"initiatorType":"link","name":"https://jira.mariadb.org/s/2c21342762a6a02add1c328bed317ffd-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/css/_super/batch.css","startTime":606.6000001430511,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":606.6000001430511,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":1065.9000000953674,"responseStart":0,"secureConnectionStart":0},{"duration":460,"initiatorType":"link","name":"https://jira.mariadb.org/s/7ebd35e77e471bc30ff0eba799ebc151-CDN/lu2cib/820016/12ta74/494e4c556ecbb29f90a3d3b4f09cb99c/_/download/contextbatch/css/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.css?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&slack-enabled=true&whisper-enabled=true","startTime":606.9000000953674,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":606.9000000953674,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":1066.9000000953674,"responseStart":0,"secureConnectionStart":0},{"duration":517.7000000476837,"initiatorType":"script","name":"https://jira.mariadb.org/s/0917945aaa57108d00c5076fea35e069-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/js/_super/batch.js?locale=en","startTime":607,"connectEnd":607,"connectStart":607,"domainLookupEnd":607,"domainLookupStart":607,"fetchStart":607,
"redirectEnd":0,"redirectStart":0,"requestStart":607,"responseEnd":1124.7000000476837,"responseStart":1124.7000000476837,"secureConnectionStart":607},{"duration":604.7999999523163,"initiatorType":"script","name":"https://jira.mariadb.org/s/2d8175ec2fa4c816e8023260bd8c1786-CDN/lu2cib/820016/12ta74/494e4c556ecbb29f90a3d3b4f09cb99c/_/download/contextbatch/js/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.js?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&locale=en&slack-enabled=true&whisper-enabled=true","startTime":607.2000000476837,"connectEnd":607.2000000476837,"connectStart":607.2000000476837,"domainLookupEnd":607.2000000476837,"domainLookupStart":607.2000000476837,"fetchStart":607.2000000476837,"redirectEnd":0,"redirectStart":0,"requestStart":607.2000000476837,"responseEnd":1212,"responseStart":1212,"secureConnectionStart":607.2000000476837},{"duration":608.4000000953674,"initiatorType":"script","name":"https://jira.mariadb.org/s/a9324d6758d385eb45c462685ad88f1d-CDN/lu2cib/820016/12ta74/c92c0caa9a024ae85b0ebdbed7fb4bd7/_/download/contextbatch/js/atl.global,-_super/batch.js?locale=en","startTime":607.4000000953674,"connectEnd":607.4000000953674,"connectStart":607.4000000953674,"domainLookupEnd":607.4000000953674,"domainLookupStart":607.4000000953674,"fetchStart":607.4000000953674,"redirectEnd":0,"redirectStart":0,"requestStart":607.4000000953674,"responseEnd":1215.8000001907349,"responseStart":1215.8000001907349,"secureConnectionStart":607.4000000953674},{"duration":608.7999999523163,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-en/jira.webresources:calendar-en.js","startTime":607.6000001430511,"connectEnd":607.6000001430511,"connectStart":607.6000001430511,"domainLookupEnd":607.6000001430511,"domainLookupStart":607.6000001430511,"fetchStart":607.6000001430511
,"redirectEnd":0,"redirectStart":0,"requestStart":607.6000001430511,"responseEnd":1216.4000000953674,"responseStart":1216.4000000953674,"secureConnectionStart":607.6000001430511},{"duration":609.0999999046326,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-localisation-moment/jira.webresources:calendar-localisation-moment.js","startTime":607.8000001907349,"connectEnd":607.8000001907349,"connectStart":607.8000001907349,"domainLookupEnd":607.8000001907349,"domainLookupStart":607.8000001907349,"fetchStart":607.8000001907349,"redirectEnd":0,"redirectStart":0,"requestStart":607.8000001907349,"responseEnd":1216.9000000953674,"responseStart":1216.9000000953674,"secureConnectionStart":607.8000001907349},{"duration":690.3000001907349,"initiatorType":"link","name":"https://jira.mariadb.org/s/b04b06a02d1959df322d9cded3aeecc1-CDN/lu2cib/820016/12ta74/a2ff6aa845ffc9a1d22fe23d9ee791fc/_/download/contextbatch/css/jira.global.look-and-feel,-_super/batch.css","startTime":608,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":608,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":1298.3000001907349,"responseStart":0,"secureConnectionStart":0},{"duration":609.2000000476837,"initiatorType":"script","name":"https://jira.mariadb.org/rest/api/1.0/shortcuts/820016/47140b6e0a9bc2e4913da06536125810/shortcuts.js?context=issuenavigation&context=issueaction","startTime":608.1000001430511,"connectEnd":608.1000001430511,"connectStart":608.1000001430511,"domainLookupEnd":608.1000001430511,"domainLookupStart":608.1000001430511,"fetchStart":608.1000001430511,"redirectEnd":0,"redirectStart":0,"requestStart":608.1000001430511,"responseEnd":1217.3000001907349,"responseStart":1217.3000001907349,"secureConnectionStart":608.1000001430511},{"duration":690.5,"initiatorType":"link","name":"https://jira.mariadb.org/s/3ac36323ba5e4eb0af2aa7ac721
1b4bb-CDN/lu2cib/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/css/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.css?jira.create.linked.issue=true","startTime":608.3000001907349,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":608.3000001907349,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":1298.8000001907349,"responseStart":0,"secureConnectionStart":0},{"duration":609.4000000953674,"initiatorType":"script","name":"https://jira.mariadb.org/s/5d5e8fe91fbc506585e83ea3b62ccc4b-CDN/lu2cib/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/js/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.js?jira.create.linked.issue=true&locale=en","startTime":608.4000000953674,"connectEnd":608.4000000953674,"connectStart":608.4000000953674,"domainLookupEnd":608.4000000953674,"domainLookupStart":608.4000000953674,"fetchStart":608.4000000953674,"redirectEnd":0,"redirectStart":0,"requestStart":608.4000000953674,"responseEnd":1217.8000001907349,"responseStart":1217.8000001907349,"secureConnectionStart":608.4000000953674},{"duration":1141.4000000953674,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-js/jira.webresources:bigpipe-js.js","startTime":614,"connectEnd":614,"connectStart":614,"domainLookupEnd":614,"domainLookupStart":614,"fetchStart":614,"redirectEnd":0,"redirectStart":0,"requestStart":614,"responseEnd":1755.4000000953674,"responseStart":1755.4000000953674,"secureConnectionStart":614},{"duration":1292.2999999523163,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-init/jira.webresources:bigpipe-init.js","startTime":646.3000001907349,"connectEnd":646.300000
1907349,"connectStart":646.3000001907349,"domainLookupEnd":646.3000001907349,"domainLookupStart":646.3000001907349,"fetchStart":646.3000001907349,"redirectEnd":0,"redirectStart":0,"requestStart":646.3000001907349,"responseEnd":1938.6000001430511,"responseStart":1938.6000001430511,"secureConnectionStart":646.3000001907349},{"duration":467.2000000476837,"initiatorType":"xmlhttprequest","name":"https://jira.mariadb.org/rest/webResources/1.0/resources","startTime":1316.1000001430511,"connectEnd":1316.1000001430511,"connectStart":1316.1000001430511,"domainLookupEnd":1316.1000001430511,"domainLookupStart":1316.1000001430511,"fetchStart":1316.1000001430511,"redirectEnd":0,"redirectStart":0,"requestStart":1316.1000001430511,"responseEnd":1783.3000001907349,"responseStart":1783.3000001907349,"secureConnectionStart":1316.1000001430511},{"duration":447.80000019073486,"initiatorType":"script","name":"https://www.google-analytics.com/analytics.js","startTime":1638.5,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":1638.5,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":2086.300000190735,"responseStart":0,"secureConnectionStart":0}],"fetchStart":0,"domainLookupStart":0,"domainLookupEnd":0,"connectStart":0,"connectEnd":0,"requestStart":422,"responseStart":601,"responseEnd":646,"domLoading":605,"domInteractive":2021,"domContentLoadedEventStart":2021,"domContentLoadedEventEnd":2086,"domComplete":2847,"loadEventStart":2847,"loadEventEnd":2847,"userAgent":"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; 
+claudebot@anthropic.com)","marks":[{"name":"bigPipe.sidebar-id.start","time":1957.9000000953674},{"name":"bigPipe.sidebar-id.end","time":1958.8000001907349},{"name":"bigPipe.activity-panel-pipe-id.start","time":1959},{"name":"bigPipe.activity-panel-pipe-id.end","time":1976.4000000953674},{"name":"activityTabFullyLoaded","time":2106.9000000953674}],"measures":[],"correlationId":"b15c8dcc79d0c7","effectiveType":"4g","downlink":9.3,"rtt":0,"serverDuration":114,"dbReadsTimeInMs":11,"dbConnsTimeInMs":20,"applicationHash":"9d11dbea5f4be3d4cc21f03a88dd11d8c8687422","experiments":[]}}
A relatively easy workaround for Linux bugs could be to retry the request with synchronous I/O once EAGAIN has been returned.
Something like the patch below (but note: I have neither compiled nor tested it, as I do not have an io_uring environment):
diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc
index bdc3601ae35..b5a5279b99a 100644
--- a/tpool/aio_liburing.cc
+++ b/tpool/aio_liburing.cc
@@ -162,6 +162,16 @@ class aio_uring final : public tpool::aio
io_uring_cqe_seen(&aio->uring_, cqe);
+ if (iocb->m_err == EAGAIN)
+ {
+ if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+ iocb->m_ret_len= pread(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+ else
+ iocb->m_ret_len= pwrite(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+
+ iocb->m_err= iocb->m_ret_len < 0 ? errno : 0;
+ }
+
iocb->m_internal_task.m_func= iocb->m_callback;
iocb->m_internal_task.m_arg= iocb;
iocb->m_internal_task.m_group= iocb->m_group;
krizhanovsky, maybe you can check if that patch helps?