Hi,
We've run into an issue with MariaDB when running the Sysbench "oltp.lua" test with 8 threads. The server daemon crashes, most often with an assertion failure at storage/xtradb/fil/fil0fil.c:5288:
fil_node_complete_io(
|
/*=================*/
|
fil_node_t* node, /*!< in: file node */
|
fil_system_t* system, /*!< in: tablespace memory cache */
|
ulint type) /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
|
the node as modified if
|
type == OS_FILE_WRITE */
|
{
|
ut_ad(node);
|
ut_ad(system);
|
ut_ad(mutex_own(&(system->mutex)));
|
|
ut_a(node->n_pending > 0); <-- failure point
|
|
node->n_pending--;
|
An attached debugger gave the following backtrace:
(gdb) bt full
|
#0 0x0000007fb1d44d18 in __GI_raise (sig=sig@entry=6)
|
at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
|
_sys_result = 0
|
pd = 0x7fa2fff1a0
|
pid = <optimised out>
|
selftid = 5661
|
#1 0x0000007fb1d4818c in __GI_abort () at abort.c:89
|
save_stage = 2
|
act = {__sigaction_handler = {sa_handler = 0x7f00000000,
|
sa_sigaction = 0x7f00000000}, sa_mask = {__val = {548445445976,
|
404, 1, 404, 0, 366924161824, 548195526816, 366921716804,
|
366927454208, 3, 0, 548434850424, 366927869208, 366936283408,
|
548195527344, 548434850424}}, sa_flags = 5288,
|
sa_restorer = 0xa2fff1a0}
|
sigs = {__val = {32, 0 <repeats 15 times>}}
|
#2 0x000000556e3d1448 in fil_node_complete_io (node=<optimised out>,
|
system=<optimised out>, type=<optimised out>)
|
at /build/buildd/mariadb-5.5-5.5.36/storage/xtradb/fil/fil0fil.c:5288
|
No locals.
|
#3 0x000000556e3db800 in fil_aio_wait (segment=segment@entry=3)
|
at /build/buildd/mariadb-5.5-5.5.36/storage/xtradb/fil/fil0fil.c:5705
|
ret = <optimised out>
|
fil_node = 0x7fb14a0e78
|
message = 0x7fa54d4350
|
type = 10
|
space_id = 0
|
#4 0x000000556e3592a4 in io_handler_thread (arg=<optimised out>)
|
at /build/buildd/mariadb-5.5-5.5.36/storage/xtradb/srv/srv0start.c:486
|
segment = 3
|
#5 0x0000007fb220ae2c in start_thread (arg=0x7fa2fff1a0)
|
at pthread_create.c:314
|
pd = 0x7fa2fff1a0
|
unwind_buf = {cancel_jmp_buf = {{jmp_buf = {548195529120,
|
548981639624, 548449456128, 0, 548449452032, 548195529312,
|
548195527344, 548443965168, 8388608, 548449472512,
|
548195527056, 13770210553321828185, 0, 13770210553602140361,
|
0, 0, 0, 0, 0, 0, 0, 0}, mask_was_saved = 0}}, priv = {pad = {
|
0x0, 0x0, 0x7fb220ad7c <start_thread>, 0x7fa2fff1a0}, data = {
|
prev = 0x0, cleanup = 0x0, canceltype = -1306481284}}}
|
not_first_call = 0
|
pagesize_m1 = <optimised out>
|
sp = <optimised out>
|
freesize = <optimised out>
|
__PRETTY_FUNCTION__ = "start_thread"
|
#6 0x0000007fb1dd9c40 in clone ()
|
at ../ports/sysdeps/unix/sysv/linux/aarch64/nptl/../clone.S:96
|
No locals.
|
Once the daemon crashed we've sometimes been unable to start it again without wiping out the database and re-installing it.
Having done some digging it is apparent that there is a problem in the mutex_exit code path; in particular at:
http://bazaar.launchpad.net/~maria-captains/maria/5.5/view/head:/storage/xtradb/include/sync0sync.ic#L106
A load-acquire is used to exit the mutex rather than a store-release. This leads to unpredictable results for architectures with a weak memory model.
We have the following in program order:
- mutex_enter -> load-acquire lock, loop until it is 0, then set to 1 relaxed
- protected work
- mutex_exit -> load-acquire lock, set it to 0 regardless.
However, the following sequence of events can be observed by another core:
- mutex_enter -> load-acquire lock, loop until it is 0, then set to 1 relaxed
- some of the protected work
- mutex_exit -> load-acquire lock, set it to 0 regardless.
- some more of the protected work (observed after the lock has been released, so no longer protected).
The above can lead (and on our test system has led) to severe data corruption that prevents the daemon from even restarting.
I've attached an emergency patch that re-introduces __sync_lock_release to release the mutex. This fixes the crash and data corruption issues for me, but I understand from comments in the code that there were issues with this function in the past. Could the gcc intrinsics be moved over to the __atomic_* functions? Ideally:
To acquire the lock:
__atomic_exchange_n(ptr, (byte) new_val, __ATOMIC_ACQUIRE)
|
To release the lock:
__atomic_store_n(ptr, (byte) new_val, __ATOMIC_RELEASE)
|
(which also worked on my test system).
I believe this issue may affect other versions of MariaDB, but I've only tested 5.5.36.
Cheers,
–
Steve Capper
{"report":{"fcp":1185.3000001907349,"ttfb":228.5999994277954,"pageVisibility":"visible","entityId":43522,"key":"jira.project.issue.view-issue","isInitial":true,"threshold":1000,"elementTimings":{},"userDeviceMemory":8,"userDeviceProcessors":32,"apdex":0.5,"journeyId":"4b00191e-6238-4257-bb40-37f3fbafc683","navigationType":0,"readyForUser":1462.1999998092651,"redirectCount":0,"resourceLoadedEnd":1635.5,"resourceLoadedStart":255.30000019073486,"resourceTiming":[{"duration":379,"initiatorType":"link","name":"https://jira.mariadb.org/s/2c21342762a6a02add1c328bed317ffd-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/css/_super/batch.css","startTime":255.30000019073486,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":255.30000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":634.3000001907349,"responseStart":0,"secureConnectionStart":0},{"duration":378.19999980926514,"initiatorType":"link","name":"https://jira.mariadb.org/s/7ebd35e77e471bc30ff0eba799ebc151-CDN/lu2cib/820016/12ta74/2bf333562ca6724060a9d5f1535471f6/_/download/contextbatch/css/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.css?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&slack-enabled=true","startTime":256.3999996185303,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":256.3999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":634.5999994277954,"responseStart":0,"secureConnectionStart":0},{"duration":387.20000076293945,"initiatorType":"script","name":"https://jira.mariadb.org/s/0917945aaa57108d00c5076fea35e069-CDN/lu2cib/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/js/_super/batch.js?locale=en","startTime":256.5999994277954,"connectEnd":256.5999994277954,"connectStart":256.5999994277954,"domainLookupEnd":256.5999994277954,"domainLookupSt
art":256.5999994277954,"fetchStart":256.5999994277954,"redirectEnd":0,"redirectStart":0,"requestStart":256.5999994277954,"responseEnd":643.8000001907349,"responseStart":643.6999998092651,"secureConnectionStart":256.5999994277954},{"duration":422.79999923706055,"initiatorType":"script","name":"https://jira.mariadb.org/s/2d8175ec2fa4c816e8023260bd8c1786-CDN/lu2cib/820016/12ta74/2bf333562ca6724060a9d5f1535471f6/_/download/contextbatch/js/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.js?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&locale=en&slack-enabled=true","startTime":256.80000019073486,"connectEnd":256.80000019073486,"connectStart":256.80000019073486,"domainLookupEnd":256.80000019073486,"domainLookupStart":256.80000019073486,"fetchStart":256.80000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":256.80000019073486,"responseEnd":679.5999994277954,"responseStart":679.5999994277954,"secureConnectionStart":256.80000019073486},{"duration":426.30000019073486,"initiatorType":"script","name":"https://jira.mariadb.org/s/a9324d6758d385eb45c462685ad88f1d-CDN/lu2cib/820016/12ta74/c92c0caa9a024ae85b0ebdbed7fb4bd7/_/download/contextbatch/js/atl.global,-_super/batch.js?locale=en","startTime":257.0999994277954,"connectEnd":257.0999994277954,"connectStart":257.0999994277954,"domainLookupEnd":257.0999994277954,"domainLookupStart":257.0999994277954,"fetchStart":257.0999994277954,"redirectEnd":0,"redirectStart":0,"requestStart":257.0999994277954,"responseEnd":683.3999996185303,"responseStart":683.3999996185303,"secureConnectionStart":257.0999994277954},{"duration":426.80000019073486,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-en/jira.webresources:calendar-en.js","startTime":257.19999980926514,"connectEnd":257.19999980926514,"connectStart":257.19999980926514,"domain
LookupEnd":257.19999980926514,"domainLookupStart":257.19999980926514,"fetchStart":257.19999980926514,"redirectEnd":0,"redirectStart":0,"requestStart":257.19999980926514,"responseEnd":684,"responseStart":684,"secureConnectionStart":257.19999980926514},{"duration":427.19999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-localisation-moment/jira.webresources:calendar-localisation-moment.js","startTime":257.30000019073486,"connectEnd":257.30000019073486,"connectStart":257.30000019073486,"domainLookupEnd":257.30000019073486,"domainLookupStart":257.30000019073486,"fetchStart":257.30000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":257.30000019073486,"responseEnd":684.5,"responseStart":684.5,"secureConnectionStart":257.30000019073486},{"duration":439.30000019073486,"initiatorType":"link","name":"https://jira.mariadb.org/s/b04b06a02d1959df322d9cded3aeecc1-CDN/lu2cib/820016/12ta74/a2ff6aa845ffc9a1d22fe23d9ee791fc/_/download/contextbatch/css/jira.global.look-and-feel,-_super/batch.css","startTime":257.5999994277954,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":257.5999994277954,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":696.8999996185303,"responseStart":0,"secureConnectionStart":0},{"duration":427.19999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/rest/api/1.0/shortcuts/820016/47140b6e0a9bc2e4913da06536125810/shortcuts.js?context=issuenavigation&context=issueaction","startTime":257.80000019073486,"connectEnd":257.80000019073486,"connectStart":257.80000019073486,"domainLookupEnd":257.80000019073486,"domainLookupStart":257.80000019073486,"fetchStart":257.80000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":257.80000019073486,"responseEnd":685,"responseStart":685,"secureConnectionStart":257.80000019073486},{"duration":438.8999996185303,"initiatorT
ype":"link","name":"https://jira.mariadb.org/s/3ac36323ba5e4eb0af2aa7ac7211b4bb-CDN/lu2cib/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/css/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.css?jira.create.linked.issue=true","startTime":258,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":258,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":696.8999996185303,"responseStart":0,"secureConnectionStart":0},{"duration":427.3999996185303,"initiatorType":"script","name":"https://jira.mariadb.org/s/5d5e8fe91fbc506585e83ea3b62ccc4b-CDN/lu2cib/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/js/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.js?jira.create.linked.issue=true&locale=en","startTime":258.19999980926514,"connectEnd":258.19999980926514,"connectStart":258.19999980926514,"domainLookupEnd":258.19999980926514,"domainLookupStart":258.19999980926514,"fetchStart":258.19999980926514,"redirectEnd":0,"redirectStart":0,"requestStart":258.19999980926514,"responseEnd":685.5999994277954,"responseStart":685.5999994277954,"secureConnectionStart":258.19999980926514},{"duration":621,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-js/jira.webresources:bigpipe-js.js","startTime":259.19999980926514,"connectEnd":259.19999980926514,"connectStart":259.19999980926514,"domainLookupEnd":259.19999980926514,"domainLookupStart":259.19999980926514,"fetchStart":259.19999980926514,"redirectEnd":0,"redirectStart":0,"requestStart":259.19999980926514,"responseEnd":880.1999998092651,"responseStart":880.1999998092651,"secureConnectionStart":259.19999980926514},{"duration":1376.1000003814697,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2cib/
820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-init/jira.webresources:bigpipe-init.js","startTime":259.3999996185303,"connectEnd":259.3999996185303,"connectStart":259.3999996185303,"domainLookupEnd":259.3999996185303,"domainLookupStart":259.3999996185303,"fetchStart":259.3999996185303,"redirectEnd":0,"redirectStart":0,"requestStart":259.3999996185303,"responseEnd":1635.5,"responseStart":1635.5,"secureConnectionStart":259.3999996185303},{"duration":601.6000003814697,"initiatorType":"xmlhttprequest","name":"https://jira.mariadb.org/rest/webResources/1.0/resources","startTime":913.6999998092651,"connectEnd":913.6999998092651,"connectStart":913.6999998092651,"domainLookupEnd":913.6999998092651,"domainLookupStart":913.6999998092651,"fetchStart":913.6999998092651,"redirectEnd":0,"redirectStart":0,"requestStart":913.6999998092651,"responseEnd":1515.3000001907349,"responseStart":1515.3000001907349,"secureConnectionStart":913.6999998092651}],"fetchStart":0,"domainLookupStart":0,"domainLookupEnd":0,"connectStart":0,"connectEnd":0,"requestStart":60,"responseStart":229,"responseEnd":239,"domLoading":253,"domInteractive":1673,"domContentLoadedEventStart":1673,"domContentLoadedEventEnd":1728,"domComplete":2968,"loadEventStart":2968,"loadEventEnd":2970,"userAgent":"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)","marks":[{"name":"bigPipe.sidebar-id.start","time":1637.5999994277954},{"name":"bigPipe.sidebar-id.end","time":1638.3999996185303},{"name":"bigPipe.activity-panel-pipe-id.start","time":1638.5},{"name":"bigPipe.activity-panel-pipe-id.end","time":1640.8000001907349},{"name":"activityTabFullyLoaded","time":1747.0999994277954}],"measures":[],"correlationId":"dc7f9b4f2bd92c","effectiveType":"4g","downlink":10,"rtt":0,"serverDuration":96,"dbReadsTimeInMs":11,"dbConnsTimeInMs":19,"applicationHash":"9d11dbea5f4be3d4cc21f03a88dd11d8c8687422","experiments":[]}}