This is an almost 100% reproducible crash on MariaDB server 10.6 as of 7e1ec1550ceff29a983bf799622d97b73b79ce43 compiled with -DWITH_URING=yes.
I run the sysbench-tpcc (https://github.com/Percona-Lab/sysbench-tpcc) prepare step on a 40-core machine as
./tpcc.lua --mysql-host=yang04g --mysql-user=sbtest --mysql-password=sbtest --mysql-db=sbtest --time=1200 --threads=56 --report-interval=1 --tables=10 --scale=100 --use_fk=0 --mysql_table_options='DEFAULT CHARSET=utf8mb4' prepare
|
against a similar 40-core machine running the MariaDB server. After several minutes of the workload, the server crashes.
Backtrace:
10.6 7e1ec1550ceff29a983bf799622d97b73b79ce43
|
#0 0x00007f2e109d8aa1 in pthread_kill () from /lib64/libpthread.so.0
|
#1 0x000055af0b0902c7 in my_write_core (sig=<optimized out>) at /root/krizhanovsky/server/mysys/stacktrace.c:424
|
#2 0x000055af0abd3610 in handle_fatal_signal (sig=6) at /root/krizhanovsky/server/sql/signal_handler.cc:343
|
#3 <signal handler called>
|
#4 0x00007f2e10634387 in raise () from /lib64/libc.so.6
|
#5 0x00007f2e10635a78 in abort () from /lib64/libc.so.6
|
#6 0x000055af0a8da889 in ut_dbg_assertion_failed (expr=expr@entry=0x55af0b2a86a7 "cb->m_err == DB_SUCCESS",
|
file=file@entry=0x55af0b2a8a10 "/root/krizhanovsky/server/storage/innobase/os/os0file.cc", line=line@entry=3843)
|
at /root/krizhanovsky/server/storage/innobase/ut/ut0dbg.cc:60
|
#7 0x000055af0a8c3fe0 in io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3843
|
#8 io_callback (cb=<optimized out>) at /root/krizhanovsky/server/storage/innobase/os/os0file.cc:3841
|
#9 0x000055af0b035668 in tpool::task_group::execute (this=0x55af0c8f27d0, t=0x55af0c917c78) at /root/krizhanovsky/server/tpool/task_group.cc:55
|
#10 0x000055af0b0345af in tpool::thread_pool_generic::worker_main (this=0x55af0c816320, thread_var=0x55af0c823fc0) at /root/krizhanovsky/server/tpool/tpool_generic.cc:550
|
#11 0x000055af0b0f7cff in execute_native_thread_routine ()
|
#12 0x00007f2e109d3ea5 in start_thread () from /lib64/libpthread.so.0
|
#13 0x00007f2e106fc8dd in clone () from /lib64/libc.so.6
|
The following patch
--- a/tpool/aio_liburing.cc
|
+++ b/tpool/aio_liburing.cc
|
@@ -152,6 +152,9 @@ class aio_uring final : public tpool::aio
|
if (res < 0)
|
{
|
iocb->m_err= -res;
|
+ my_printf_error(ER_UNKNOWN_ERROR,
|
+ "io_uring_cqe_get_data() returned %d\n",
|
+ ME_ERROR_LOG | ME_FATAL, res);
|
iocb->m_ret_len= 0;
|
}
|
else
|
produces the line
2021-05-23 11:07:09 0 [ERROR] mariadbd: io_uring_cqe_get_data() returned -11
|
in the error log.
- is caused by
-
MDEV-24883
add io_uring support for tpool
-
-
Closed
{"report":{"fcp":943,"ttfb":267.5,"pageVisibility":"visible","entityId":99881,"key":"jira.project.issue.view-issue","isInitial":true,"threshold":1000,"elementTimings":{},"userDeviceMemory":8,"userDeviceProcessors":64,"apdex":0.5,"journeyId":"d2815596-5ee9-4e68-bd84-b34a1bb08d24","navigationType":0,"readyForUser":1018.2999997138977,"redirectCount":0,"resourceLoadedEnd":624.2999997138977,"resourceLoadedStart":275.59999990463257,"resourceTiming":[{"duration":22,"initiatorType":"link","name":"https://jira.mariadb.org/s/2c21342762a6a02add1c328bed317ffd-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/css/_super/batch.css","startTime":275.59999990463257,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":275.59999990463257,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":297.59999990463257,"responseStart":0,"secureConnectionStart":0},{"duration":22.100000381469727,"initiatorType":"link","name":"https://jira.mariadb.org/s/7ebd35e77e471bc30ff0eba799ebc151-CDN/lu2cib/820016/12ta74/494e4c556ecbb29f90a3d3b4f09cb99c/_/download/contextbatch/css/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.css?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&slack-enabled=true&whisper-enabled=true","startTime":275.8999996185303,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":275.8999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":298,"responseStart":0,"secureConnectionStart":0},{"duration":220.69999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/s/0917945aaa57108d00c5076fea35e069-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/js/_super/batch.js?locale=en","startTime":276.09999990463257,"connectEnd":276.09999990463257,"connectStart":276.09999990463257,"domainLookupEnd":276.09999990463257,"domainLookupStart":
276.09999990463257,"fetchStart":276.09999990463257,"redirectEnd":0,"redirectStart":0,"requestStart":302.3999996185303,"responseEnd":496.7999997138977,"responseStart":325.3999996185303,"secureConnectionStart":276.09999990463257},{"duration":347.69999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/s/2d8175ec2fa4c816e8023260bd8c1786-CDN/lu2cib/820016/12ta74/494e4c556ecbb29f90a3d3b4f09cb99c/_/download/contextbatch/js/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.js?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&locale=en&slack-enabled=true&whisper-enabled=true","startTime":276.59999990463257,"connectEnd":301.2999997138977,"connectStart":301.2999997138977,"domainLookupEnd":301.2999997138977,"domainLookupStart":301.2999997138977,"fetchStart":276.59999990463257,"redirectEnd":0,"redirectStart":0,"requestStart":302.19999980926514,"responseEnd":624.2999997138977,"responseStart":320.19999980926514,"secureConnectionStart":301.2999997138977},{"duration":40.200000286102295,"initiatorType":"script","name":"https://jira.mariadb.org/s/a9324d6758d385eb45c462685ad88f1d-CDN/lu2cib/820016/12ta74/c92c0caa9a024ae85b0ebdbed7fb4bd7/_/download/contextbatch/js/atl.global,-_super/batch.js?locale=en","startTime":276.8999996185303,"connectEnd":276.8999996185303,"connectStart":276.8999996185303,"domainLookupEnd":276.8999996185303,"domainLookupStart":276.8999996185303,"fetchStart":276.8999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":302.8999996185303,"responseEnd":317.09999990463257,"responseStart":316.09999990463257,"secureConnectionStart":276.8999996185303},{"duration":60,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-en/jira.webresources:calendar-en.js","startTime":277,"connectEnd":277,"connectStart":277,"domainLookupEnd":277,"domainLookupStart":277,"fetc
hStart":277,"redirectEnd":0,"redirectStart":0,"requestStart":306.59999990463257,"responseEnd":337,"responseStart":335.69999980926514,"secureConnectionStart":277},{"duration":62.80000019073486,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-localisation-moment/jira.webresources:calendar-localisation-moment.js","startTime":277.19999980926514,"connectEnd":277.19999980926514,"connectStart":277.19999980926514,"domainLookupEnd":277.19999980926514,"domainLookupStart":277.19999980926514,"fetchStart":277.19999980926514,"redirectEnd":0,"redirectStart":0,"requestStart":309,"responseEnd":340,"responseStart":337.09999990463257,"secureConnectionStart":277.19999980926514},{"duration":30.40000009536743,"initiatorType":"link","name":"https://jira.mariadb.org/s/b04b06a02d1959df322d9cded3aeecc1-CDN/lu2cib/820016/12ta74/a2ff6aa845ffc9a1d22fe23d9ee791fc/_/download/contextbatch/css/jira.global.look-and-feel,-_super/batch.css","startTime":277.2999997138977,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":277.2999997138977,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":307.69999980926514,"responseStart":0,"secureConnectionStart":0},{"duration":63,"initiatorType":"script","name":"https://jira.mariadb.org/rest/api/1.0/shortcuts/820016/47140b6e0a9bc2e4913da06536125810/shortcuts.js?context=issuenavigation&context=issueaction","startTime":277.3999996185303,"connectEnd":277.3999996185303,"connectStart":277.3999996185303,"domainLookupEnd":277.3999996185303,"domainLookupStart":277.3999996185303,"fetchStart":277.3999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":309.09999990463257,"responseEnd":340.3999996185303,"responseStart":337.7999997138977,"secureConnectionStart":277.3999996185303},{"duration":30.90000009536743,"initiatorType":"link","name":"https://jira.mariadb.org/s/3ac36323ba5e4eb0af2aa7ac7211b4bb-CDN/lu2ci
b/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/css/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.css?jira.create.linked.issue=true","startTime":277.59999990463257,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":277.59999990463257,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":308.5,"responseStart":0,"secureConnectionStart":0},{"duration":63,"initiatorType":"script","name":"https://jira.mariadb.org/s/5d5e8fe91fbc506585e83ea3b62ccc4b-CDN/lu2cib/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/js/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.js?jira.create.linked.issue=true&locale=en","startTime":277.7999997138977,"connectEnd":277.7999997138977,"connectStart":277.7999997138977,"domainLookupEnd":277.7999997138977,"domainLookupStart":277.7999997138977,"fetchStart":277.7999997138977,"redirectEnd":0,"redirectStart":0,"requestStart":311.09999990463257,"responseEnd":340.7999997138977,"responseStart":338.7999997138977,"secureConnectionStart":277.7999997138977},{"duration":325.6000003814697,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-js/jira.webresources:bigpipe-js.js","startTime":279.8999996185303,"connectEnd":279.8999996185303,"connectStart":279.8999996185303,"domainLookupEnd":279.8999996185303,"domainLookupStart":279.8999996185303,"fetchStart":279.8999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":345.59999990463257,"responseEnd":605.5,"responseStart":601.1999998092651,"secureConnectionStart":279.8999996185303},{"duration":311.09999990463257,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-init/jira.webresources:bigpipe-init.j
s","startTime":298.7999997138977,"connectEnd":298.7999997138977,"connectStart":298.7999997138977,"domainLookupEnd":298.7999997138977,"domainLookupStart":298.7999997138977,"fetchStart":298.7999997138977,"redirectEnd":0,"redirectStart":0,"requestStart":367.5,"responseEnd":609.8999996185303,"responseStart":604.5,"secureConnectionStart":298.7999997138977},{"duration":57.40000009536743,"initiatorType":"xmlhttprequest","name":"https://jira.mariadb.org/rest/webResources/1.0/resources","startTime":642.6999998092651,"connectEnd":642.6999998092651,"connectStart":642.6999998092651,"domainLookupEnd":642.6999998092651,"domainLookupStart":642.6999998092651,"fetchStart":642.6999998092651,"redirectEnd":0,"redirectStart":0,"requestStart":665.0999999046326,"responseEnd":700.0999999046326,"responseStart":699.1999998092651,"secureConnectionStart":642.6999998092651},{"duration":181.59999990463257,"initiatorType":"xmlhttprequest","name":"https://jira.mariadb.org/rest/webResources/1.0/resources","startTime":850.0999999046326,"connectEnd":850.0999999046326,"connectStart":850.0999999046326,"domainLookupEnd":850.0999999046326,"domainLookupStart":850.0999999046326,"fetchStart":850.0999999046326,"redirectEnd":0,"redirectStart":0,"requestStart":1000.6999998092651,"responseEnd":1031.6999998092651,"responseStart":1031,"secureConnectionStart":850.0999999046326}],"fetchStart":0,"domainLookupStart":24,"domainLookupEnd":35,"connectStart":35,"connectEnd":57,"secureConnectionStart":45,"requestStart":57,"responseStart":267,"responseEnd":298,"domLoading":270,"domInteractive":1101,"domContentLoadedEventStart":1101,"domContentLoadedEventEnd":1164,"domComplete":1276,"loadEventStart":1276,"loadEventEnd":1276,"userAgent":"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; 
+claudebot@anthropic.com)","marks":[{"name":"bigPipe.sidebar-id.start","time":1060.8999996185303},{"name":"bigPipe.sidebar-id.end","time":1061.5999999046326},{"name":"bigPipe.activity-panel-pipe-id.start","time":1061.7999997138977},{"name":"bigPipe.activity-panel-pipe-id.end","time":1067.2999997138977},{"name":"activityTabFullyLoaded","time":1181.7999997138977}],"measures":[],"correlationId":"f3592d5c0413a3","effectiveType":"4g","downlink":10,"rtt":0,"serverDuration":121,"dbReadsTimeInMs":12,"dbConnsTimeInMs":21,"applicationHash":"9d11dbea5f4be3d4cc21f03a88dd11d8c8687422","experiments":[]}}
A relatively easy workaround for Linux bugs could be to retry the request with synchronous I/O once EAGAIN has been returned.
Something like the patch below (note: I have neither tested nor compiled it, as I do not have an io_uring environment):
diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc
index bdc3601ae35..b5a5279b99a 100644
--- a/tpool/aio_liburing.cc
+++ b/tpool/aio_liburing.cc
@@ -162,6 +162,16 @@ class aio_uring final : public tpool::aio
io_uring_cqe_seen(&aio->uring_, cqe);
+ if (iocb->m_err == EAGAIN)
+ {
+ if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+ iocb->m_ret_len= pread(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+ else
+ iocb->m_ret_len= pwrite(iocb->m_fh, iocb->m_buffer, iocb->m_len, iocb->m_offset);
+
+ iocb->m_err= iocb->m_ret_len < 0 ? errno : 0;
+ }
+
iocb->m_internal_task.m_func= iocb->m_callback;
iocb->m_internal_task.m_arg= iocb;
iocb->m_internal_task.m_group= iocb->m_group;
krizhanovsky, maybe you can check if that patch helps?