Start by applying the attached patch to 10.11, revision 6ebdd3013a18b01dbecec76b870810329eb76586. It contains 17 additional random usleeps in InnoDB's locking code in storage/innobase/lock/lock0lock.cc (with a few matching extra brackets where needed). Then run the following code using a C based client but not the CLI.
Note: Most of the code in this testcase is repeated to trigger the issue more readily. Using a patched tree, the issue should show as good as every run.
# mysqld options possibly required for replay: --innodb_change_buffering=inserts --innodb_fast_shutdown=1 --innodb_file_per_table=1
CREATE DEFINER=root@localhost EVENT e1 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.2);
CREATE DEFINER=root@localhost EVENT e2 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.2);
CREATE DEFINER=root@localhost EVENT e3 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.2);
CREATE DEFINER=root@localhost EVENT e4 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e4 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
at /test/10.11_dbg/storage/innobase/lock/lock0lock.cc:123
[Current thread is 1 (Thread 0x15192d1d4700 (LWP 2940395))]
(gdb) bt
#0 lock_sys_t::hash_table::resize (this=this@entry=0x55dcb133ba98 <lock_sys+88>, n=n@entry=6553600) at /test/10.11_dbg/storage/innobase/lock/lock0lock.cc:123
#1 0x000055dcafbc307b in lock_sys_t::resize (this=0x55dcb133ba40 <lock_sys>, n_cells=6553600) at /test/10.11_dbg/storage/innobase/lock/lock0lock.cc:464
#2 0x000055dcafdecf88 in buf_pool_t::resize (this=this@entry=0x55dcb0a1f000 <buf_pool>) at /test/10.11_dbg/storage/innobase/buf/buf0buf.cc:1813
#3 0x000055dcafde1065 in buf_resize_callback () at /test/10.11_dbg/storage/innobase/buf/buf0buf.cc:1860
#4 0x000055dcafefbe0d in tpool::task_group::execute (this=0x55dcb1365be0 <single_threaded_group>, t=t@entry=0x55dcb1365b40 <buf_resize_task>) at /test/10.11_dbg/tpool/task_group.cc:70
#5 0x000055dcafefbe95 in tpool::task::execute (this=0x55dcb1365b40 <buf_resize_task>) at /test/10.11_dbg/tpool/task.cc:32
#6 0x000055dcafef98ab in tpool::thread_pool_generic::worker_main (this=0x55dcb18a14b0, thread_var=0x55dcb18a1f60) at /test/10.11_dbg/tpool/tpool_generic.cc:580
#7 0x000055dcafefafde in std::__invoke_impl<void, void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__t=<optimized out>, __f=<optimized out>) at /usr/include/c++/9/bits/invoke.h:89
#8 std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__fn=<optimized out>) at /usr/include/c++/9/bits/invoke.h:95
and the test case which is basically the same as in the description, I only converted it into an infinite loop
--source include/have_innodb.inc
CREATE DEFINER=root@localhost EVENT e1 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e2 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e3 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e4 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e5 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e6 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e7 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e8 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE TABLE t (id INT) ENGINE=InnoDB;
SET GLOBAL event_scheduler=ON;
--delimiter $$
BEGIN NOT ATOMIC
DECLARE CONTINUE HANDLER FOR 1210 BEGIN END;
LOOP
INSERT INTO t VALUES (1),(1);
SET GLOBAL innodb_buffer_pool_size=21474836480;
SELECT SLEEP(3);
SET GLOBAL innodb_buffer_pool_size=134217728;
SELECT SLEEP(3);
END LOOP;
END $$
--delimiter ;
reproducible easily enough, both via MTR and the client (for the latter, --delimiter needs to be changed to DELIMITER), with a regular sigsegv-ed stack trace
#4 0x000056377b853fa2 in lock_sys_t::hash_table::resize (this=0x56378de2a7a0 <lock_sys+96>, n=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:123
#5 0x000056377b85788c in lock_sys_t::resize (this=0x56378de2a740 <lock_sys>, n_cells=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:463
#6 0x000056377bf54a7a in buf_pool_t::resize (this=0x56378aebbb40 <buf_pool>) at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1813
#7 0x000056377bf23c37 in buf_resize_callback () at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1860
#8 0x000056377c2fc998 in tpool::task_group::execute (this=0x56378de612a0 <single_threaded_group>, t=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task_group.cc:70
#9 0x000056377c2fdd97 in tpool::task::execute (this=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task.cc:32
#10 0x000056377c2dceb9 in tpool::thread_pool_generic::worker_main (this=0x618000000880, thread_var=0x630000020400) at /data/src/10.11-bug/tpool/tpool_generic.cc:580
#12 0x000056377c2fb642 in std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__fn=@0x604000000d28: (void (tpool::thread_pool_generic::*)(tpool::thread_pool_generic * const, tpool::worker_data *)) 0x56377c2dc8d4 <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>) at /usr/include/c++/10/bits/invoke.h:95
#13 0x000056377c2fb419 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::_M_invoke<0ul, 1ul, 2ul> (this=0x604000000d18) at /usr/include/c++/10/thread:264
#14 0x000056377c2fb29a in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::operator() (this=0x604000000d18) at /usr/include/c++/10/thread:271
#15 0x000056377c2fb262 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> > >::_M_run (this=0x604000000d10) at /usr/include/c++/10/thread:215
#16 0x00007f3129738ed0 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#17 0x00007f3129844ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
#18 0x00007f3128ac7aef in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Elena Stepanova
added a comment - - edited With this part alone from the sleeps patch (the one that introduces the loop as marko explained)
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 1c11efafc7a..d3331b0c196 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -118,7 +118,7 @@ void lock_sys_t::hash_table::resize(ulint n)
else
{
lock_t *next= static_cast<lock_t*>(c->node);
- while (next->hash)
+ while (next->hash) usleep(1000000);
next= next->hash;
next->hash= lock;
}
and the test case which is basically the same as in the description, I only converted it into an infinite loop
--source include/have_innodb.inc
CREATE DEFINER=root@localhost EVENT e1 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e2 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e3 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e4 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO DELETE FROM t;
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e5 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e6 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e7 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE DEFINER=root@localhost EVENT e8 ON SCHEDULE EVERY '1' SECOND COMMENT 'a' DO INSERT INTO t VALUES (1),(1);
SELECT SLEEP(0.1);
CREATE TABLE t (id INT ) ENGINE=InnoDB;
SET GLOBAL event_scheduler= ON ;
--delimiter $$
BEGIN NOT ATOMIC
DECLARE CONTINUE HANDLER FOR 1210 BEGIN END ;
LOOP
INSERT INTO t VALUES (1),(1);
SET GLOBAL innodb_buffer_pool_size=21474836480;
SELECT SLEEP(3);
SET GLOBAL innodb_buffer_pool_size=134217728;
SELECT SLEEP(3);
END LOOP;
END $$
--delimiter ;
reproducible easily enough, both via MTR and the client (for the latter, --delimiter needs to be changed to DELIMITER ), with a regular sigsegv-ed stack trace
10.11 6ebdd3013a18b01dbecec76b870810329eb76586 + patch above
#3 <signal handler called>
#4 0x000056377b853fa2 in lock_sys_t::hash_table::resize (this=0x56378de2a7a0 <lock_sys+96>, n=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:123
#5 0x000056377b85788c in lock_sys_t::resize (this=0x56378de2a740 <lock_sys>, n_cells=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:463
#6 0x000056377bf54a7a in buf_pool_t::resize (this=0x56378aebbb40 <buf_pool>) at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1813
#7 0x000056377bf23c37 in buf_resize_callback () at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1860
#8 0x000056377c2fc998 in tpool::task_group::execute (this=0x56378de612a0 <single_threaded_group>, t=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task_group.cc:70
#9 0x000056377c2fdd97 in tpool::task::execute (this=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task.cc:32
#10 0x000056377c2dceb9 in tpool::thread_pool_generic::worker_main (this=0x618000000880, thread_var=0x630000020400) at /data/src/10.11-bug/tpool/tpool_generic.cc:580
#11 0x000056377c2fbc94 in std::__invoke_impl<void, void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__f=@0x604000000d28: (void (tpool::thread_pool_generic::*)(tpool::thread_pool_generic * const, tpool::worker_data *)) 0x56377c2dc8d4 <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>, __t=@0x604000000d20: 0x618000000880) at /usr/include/c++/10/bits/invoke.h:73
#12 0x000056377c2fb642 in std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__fn=@0x604000000d28: (void (tpool::thread_pool_generic::*)(tpool::thread_pool_generic * const, tpool::worker_data *)) 0x56377c2dc8d4 <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>) at /usr/include/c++/10/bits/invoke.h:95
#13 0x000056377c2fb419 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::_M_invoke<0ul, 1ul, 2ul> (this=0x604000000d18) at /usr/include/c++/10/thread:264
#14 0x000056377c2fb29a in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::operator() (this=0x604000000d18) at /usr/include/c++/10/thread:271
#15 0x000056377c2fb262 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> > >::_M_run (this=0x604000000d10) at /usr/include/c++/10/thread:215
#16 0x00007f3129738ed0 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#17 0x00007f3129844ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
#18 0x00007f3128ac7aef in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Roel clarified to me that he is disabling a number of sigaction() calls. That explains why no signal handler was called on the SIGSEGV.
Usually, SIGSEGV is flagged on the instruction that immediately follows the one that caused the fault. In the debugging session that I conducted with Roel a couple of days ago, the signal was flagged on an instruction that was 2 instructions after the one that read next->hash:
0x0000561b190b84cb <+813>: cmpq $0x0,0x20(%r12)
0x0000561b190b84d1 <+819>: jne 0x561b190b84a3 <lock_sys_t::hash_table::resize(unsigned long)+773>
=> 0x0000561b190b84d3 <+821>: mov %rbx,0x20(%r12)
The sleeps.diff does introduce an infinite loop and therefore will cause buffer pool resizing to hang in those cases where the code will be executed. But, it is also exposing a race condition that I am yet to analyze: both next and next->hash are expected to remain constant in such infinite loop, but instead, we have next=nullptr at some point. A simpler patch for exposing the same race condition might be the following:
Marko Mäkelä
added a comment - Roel clarified to me that he is disabling a number of sigaction() calls. That explains why no signal handler was called on the SIGSEGV .
Usually, SIGSEGV is flagged on the instruction that immediately follows the one that caused the fault. In the debugging session that I conducted with Roel a couple of days ago, the signal was flagged on an instruction that was 2 instructions after the one that read next->hash :
0x0000561b190b84cb <+813>: cmpq $0x0,0x20(%r12)
0x0000561b190b84d1 <+819>: jne 0x561b190b84a3 <lock_sys_t::hash_table::resize(unsigned long)+773>
=> 0x0000561b190b84d3 <+821>: mov %rbx,0x20(%r12)
The sleeps.diff does introduce an infinite loop and therefore will cause buffer pool resizing to hang in those cases where the code will be executed. But, it is also exposing a race condition that I am yet to analyze: both next and next->hash are expected to remain constant in such infinite loop, but instead, we have next=nullptr at some point. A simpler patch for exposing the same race condition might be the following:
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 0d6a069d50d..27ca8a4c694 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -119,7 +119,7 @@ void lock_sys_t::hash_table::resize(ulint n)
{
lock_t *next= static_cast<lock_t*>(c->node);
while (next->hash)
- next= next->hash;
+ std::this_thread::yield();
next->hash= lock;
}
lock= next;
My version of the patch may be too simplified, because I removed the assignment that used to be the while loop body, while Roel’s patch and the simplified version that elenst posted would move that assignment after the loop.
If we have next->hash==nullptr, then the while loop will not be entered. With sleeps.diff the code would become the following:
Therefore, it remains possible that the sleeps.diff is actually causing a problem.
So far, I was unable to reproduce the problem with any variation of my simple patch that I tried: replacing std::this_thread::yield() with usleep(1000000) or adding the assignment after the loop. Roel, can you try it once more, removing the assignment after the infinite while loop?
Note: I invoked elenst’s test on 10.6 with ./mtr --mysqld=--event-scheduler=on to avoid warnings that the event scheduler is switched off. (I don’t think that this code has been changed between 10.6 and 10.11.)
Marko Mäkelä
added a comment - My version of the patch may be too simplified, because I removed the assignment that used to be the while loop body, while Roel ’s patch and the simplified version that elenst posted would move that assignment after the loop.
If we have next->hash==nullptr , then the while loop will not be entered. With sleeps.diff the code would become the following:
while (next->hash) ...; // not taken
next= next->hash; // next=nullptr!
next->hash= lock; // undefined behaviour: dereferencing next==nullptr
Therefore, it remains possible that the sleeps.diff is actually causing a problem.
So far, I was unable to reproduce the problem with any variation of my simple patch that I tried: replacing std::this_thread::yield() with usleep(1000000) or adding the assignment after the loop. Roel , can you try it once more, removing the assignment after the infinite while loop?
Note: I invoked elenst’s test on 10.6 with ./mtr --mysqld=--event-scheduler=on to avoid warnings that the event scheduler is switched off. (I don’t think that this code has been changed between 10.6 and 10.11.)
Confirmed with an UBSAN build that the direct assignment of next= next->hash; without the while (next->hash) leads to the undefined behaviour as described in the last comment (null pointer access). Closed as not a bug. It was a valuable learning experience.
The random sleeps setup/testing was based on some ideas we had previously discussed, and the buffer pool resizing was not specifically targeted but part of the input SQL.
Roel Van de Paar
added a comment - Confirmed with an UBSAN build that the direct assignment of next= next->hash; without the while (next->hash) leads to the undefined behaviour as described in the last comment (null pointer access). Closed as not a bug. It was a valuable learning experience.
The random sleeps setup/testing was based on some ideas we had previously discussed, and the buffer pool resizing was not specifically targeted but part of the input SQL.
The approach of injecting random sleeps here and there is valid and useful, but unfortunately in this case it was done incorrectly and actually caused a bug by itself.
Also I learned that SIGSEGV may be flagged a few instructions after the guilty one. This was not a waste of time.
Marko Mäkelä
added a comment - The approach of injecting random sleeps here and there is valid and useful, but unfortunately in this case it was done incorrectly and actually caused a bug by itself.
Also I learned that SIGSEGV may be flagged a few instructions after the guilty one. This was not a waste of time.
People
Roel Van de Paar
Roel Van de Paar
Votes:
0Vote for this issue
Watchers:
3Start watching this issue
Dates
Created:
Updated:
Resolved:
Git Integration
Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.
{"report":{"fcp":854.1999998092651,"ttfb":234.30000019073486,"pageVisibility":"visible","entityId":115590,"key":"jira.project.issue.view-issue","isInitial":true,"threshold":1000,"elementTimings":{},"userDeviceMemory":8,"userDeviceProcessors":64,"apdex":1,"journeyId":"0e6e9145-dc40-4463-b282-3b9246490b39","navigationType":0,"readyForUser":936.3000001907349,"redirectCount":0,"resourceLoadedEnd":993.6000003814697,"resourceLoadedStart":238.80000019073486,"resourceTiming":[{"duration":149.60000038146973,"initiatorType":"link","name":"https://jira.mariadb.org/s/2c21342762a6a02add1c328bed317ffd-CDN/lu2bu7/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/css/_super/batch.css","startTime":238.80000019073486,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":238.80000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":388.4000005722046,"responseStart":0,"secureConnectionStart":0},{"duration":149.39999961853027,"initiatorType":"link","name":"https://jira.mariadb.org/s/7ebd35e77e471bc30ff0eba799ebc151-CDN/lu2bu7/820016/12ta74/8679b4946efa1a0bb029a3a22206fb5d/_/download/contextbatch/css/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.css?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&slack-enabled=true","startTime":239.10000038146973,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":239.10000038146973,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":388.5,"responseStart":0,"secureConnectionStart":0},{"duration":158.0999994277954,"initiatorType":"script","name":"https://jira.mariadb.org/s/fbf975c0cce4b1abf04784eeae9ba1f4-CDN/lu2bu7/820016/12ta74/0a8bac35585be7fc6c9cc5a0464cd4cf/_/download/contextbatch/js/_super/batch.js?locale=en","startTime":239.10000038146973,"connectEnd":239.10000038146973,"connectStart":239.10000038146973,"domainLookupEnd":239.1000003814697
3,"domainLookupStart":239.10000038146973,"fetchStart":239.10000038146973,"redirectEnd":0,"redirectStart":0,"requestStart":239.10000038146973,"responseEnd":397.19999980926514,"responseStart":397.19999980926514,"secureConnectionStart":239.10000038146973},{"duration":223.39999961853027,"initiatorType":"script","name":"https://jira.mariadb.org/s/099b33461394b8015fc36c0a4b96e19f-CDN/lu2bu7/820016/12ta74/8679b4946efa1a0bb029a3a22206fb5d/_/download/contextbatch/js/jira.browse.project,project.issue.navigator,jira.view.issue,jira.general,jira.global,atl.general,-_super/batch.js?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&locale=en&slack-enabled=true","startTime":239.30000019073486,"connectEnd":239.30000019073486,"connectStart":239.30000019073486,"domainLookupEnd":239.30000019073486,"domainLookupStart":239.30000019073486,"fetchStart":239.30000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":239.30000019073486,"responseEnd":462.69999980926514,"responseStart":462.69999980926514,"secureConnectionStart":239.30000019073486},{"duration":227.29999923706055,"initiatorType":"script","name":"https://jira.mariadb.org/s/94c15bff32baef80f4096a08aceae8bc-CDN/lu2bu7/820016/12ta74/c92c0caa9a024ae85b0ebdbed7fb4bd7/_/download/contextbatch/js/atl.global,-_super/batch.js?locale=en","startTime":239.4000005722046,"connectEnd":239.4000005722046,"connectStart":239.4000005722046,"domainLookupEnd":239.4000005722046,"domainLookupStart":239.4000005722046,"fetchStart":239.4000005722046,"redirectEnd":0,"redirectStart":0,"requestStart":239.4000005722046,"responseEnd":466.69999980926514,"responseStart":466.69999980926514,"secureConnectionStart":239.4000005722046},{"duration":227.69999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2bu7/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-en/jira.webresources:calendar-en.js","startTime":239.5,"connectEnd":239.5,"connectStart":239.5,"domainLookupEnd":2
39.5,"domainLookupStart":239.5,"fetchStart":239.5,"redirectEnd":0,"redirectStart":0,"requestStart":239.5,"responseEnd":467.19999980926514,"responseStart":467.19999980926514,"secureConnectionStart":239.5},{"duration":228,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2bu7/820016/12ta74/1.0/_/download/batch/jira.webresources:calendar-localisation-moment/jira.webresources:calendar-localisation-moment.js","startTime":239.5,"connectEnd":239.5,"connectStart":239.5,"domainLookupEnd":239.5,"domainLookupStart":239.5,"fetchStart":239.5,"redirectEnd":0,"redirectStart":0,"requestStart":239.5,"responseEnd":467.5,"responseStart":467.5,"secureConnectionStart":239.5},{"duration":228.20000076293945,"initiatorType":"link","name":"https://jira.mariadb.org/s/b04b06a02d1959df322d9cded3aeecc1-CDN/lu2bu7/820016/12ta74/a2ff6aa845ffc9a1d22fe23d9ee791fc/_/download/contextbatch/css/jira.global.look-and-feel,-_super/batch.css","startTime":239.69999980926514,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":239.69999980926514,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":467.9000005722046,"responseStart":0,"secureConnectionStart":0},{"duration":228.10000038146973,"initiatorType":"script","name":"https://jira.mariadb.org/rest/api/1.0/shortcuts/820016/47140b6e0a9bc2e4913da06536125810/shortcuts.js?context=issuenavigation&context=issueaction","startTime":239.80000019073486,"connectEnd":239.80000019073486,"connectStart":239.80000019073486,"domainLookupEnd":239.80000019073486,"domainLookupStart":239.80000019073486,"fetchStart":239.80000019073486,"redirectEnd":0,"redirectStart":0,"requestStart":239.80000019073486,"responseEnd":467.9000005722046,"responseStart":467.9000005722046,"secureConnectionStart":239.80000019073486},{"duration":228.39999961853027,"initiatorType":"link","name":"https://jira.mariadb.org/s/3ac36323ba5e4eb0af2aa7ac7211b4bb-CDN/lu2bu7/820016/12ta74/d176f0986478cc64f24226b3d20c14
0d/_/download/contextbatch/css/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.css?jira.create.linked.issue=true","startTime":239.9000005722046,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":239.9000005722046,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":468.30000019073486,"responseStart":0,"secureConnectionStart":0},{"duration":228.5,"initiatorType":"script","name":"https://jira.mariadb.org/s/3339d87fa2538a859872f2df449bf8d0-CDN/lu2bu7/820016/12ta74/d176f0986478cc64f24226b3d20c140d/_/download/contextbatch/js/com.atlassian.jira.projects.sidebar.init,-_super,-project.issue.navigator,-jira.view.issue/batch.js?jira.create.linked.issue=true&locale=en","startTime":240,"connectEnd":240,"connectStart":240,"domainLookupEnd":240,"domainLookupStart":240,"fetchStart":240,"redirectEnd":0,"redirectStart":0,"requestStart":240,"responseEnd":468.5,"responseStart":468.5,"secureConnectionStart":240},{"duration":439.29999923706055,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2bu7/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-js/jira.webresources:bigpipe-js.js","startTime":245.9000005722046,"connectEnd":245.9000005722046,"connectStart":245.9000005722046,"domainLookupEnd":245.9000005722046,"domainLookupStart":245.9000005722046,"fetchStart":245.9000005722046,"redirectEnd":0,"redirectStart":0,"requestStart":245.9000005722046,"responseEnd":685.1999998092651,"responseStart":685.1999998092651,"secureConnectionStart":245.9000005722046},{"duration":481.69999980926514,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2bu7/820016/12ta74/1.0/_/download/batch/jira.webresources:bigpipe-init/jira.webresources:bigpipe-init.js","startTime":274,"connectEnd":274,"connectStart":274,"domainLookupEnd":274,"domainLookupStart":274,"fetchStart":274,"redirectEnd":0,"redirectStart":0,"requ
estStart":274,"responseEnd":755.6999998092651,"responseStart":755.6999998092651,"secureConnectionStart":274},{"duration":144.5,"initiatorType":"xmlhttprequest","name":"https://jira.mariadb.org/rest/webResources/1.0/resources","startTime":609.6999998092651,"connectEnd":609.6999998092651,"connectStart":609.6999998092651,"domainLookupEnd":609.6999998092651,"domainLookupStart":609.6999998092651,"fetchStart":609.6999998092651,"redirectEnd":0,"redirectStart":0,"requestStart":609.6999998092651,"responseEnd":754.1999998092651,"responseStart":754.1999998092651,"secureConnectionStart":609.6999998092651},{"duration":152.5,"initiatorType":"script","name":"https://www.google-analytics.com/analytics.js","startTime":827.1999998092651,"connectEnd":0,"connectStart":0,"domainLookupEnd":0,"domainLookupStart":0,"fetchStart":827.1999998092651,"redirectEnd":0,"redirectStart":0,"requestStart":0,"responseEnd":979.6999998092651,"responseStart":0,"secureConnectionStart":0},{"duration":151.9000005722046,"initiatorType":"script","name":"https://jira.mariadb.org/s/d41d8cd98f00b204e9800998ecf8427e-CDN/lu2bu7/820016/12ta74/e65b778d185daf5aee24936755b43da6/_/download/contextbatch/js/browser-metrics-plugin.contrib,-_super,-project.issue.navigator,-jira.view.issue,-atl.general/batch.js?agile_global_admin_condition=true&jag=true&jira.create.linked.issue=true&slack-enabled=true","startTime":841.6999998092651,"connectEnd":841.6999998092651,"connectStart":841.6999998092651,"domainLookupEnd":841.6999998092651,"domainLookupStart":841.6999998092651,"fetchStart":841.6999998092651,"redirectEnd":0,"redirectStart":0,"requestStart":841.6999998092651,"responseEnd":993.6000003814697,"responseStart":993.6000003814697,"secureConnectionStart":841.6999998092651}],"fetchStart":0,"domainLookupStart":0,"domainLookupEnd":0,"connectStart":0,"connectEnd":0,"requestStart":84,"responseStart":235,"responseEnd":274,"domLoading":238,"domInteractive":1031,"domContentLoadedEventStart":1031,"domContentLoadedEventEnd":1081,"domComp
lete":1581,"loadEventStart":1581,"loadEventEnd":1582,"userAgent":"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)","marks":[{"name":"bigPipe.sidebar-id.start","time":985.1999998092651},{"name":"bigPipe.sidebar-id.end","time":986},{"name":"bigPipe.activity-panel-pipe-id.start","time":986.1000003814697},{"name":"bigPipe.activity-panel-pipe-id.end","time":991.4000005722046},{"name":"activityTabFullyLoaded","time":1102.8000001907349}],"measures":[],"correlationId":"f35599cf8b4225","effectiveType":"4g","downlink":10,"rtt":0,"serverDuration":86,"dbReadsTimeInMs":10,"dbConnsTimeInMs":17,"applicationHash":"9d11dbea5f4be3d4cc21f03a88dd11d8c8687422","experiments":[]}}
With this part alone from the sleeps patch (the one that introduces the loop as marko explained)
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 1c11efafc7a..d3331b0c196 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -118,7 +118,7 @@ void lock_sys_t::hash_table::resize(ulint n)
else
{
lock_t *next= static_cast<lock_t*>(c->node);
- while (next->hash)
+ while (next->hash) usleep(1000000);
next= next->hash;
next->hash= lock;
}
and the test case which is basically the same as in the description, I only converted it into an infinite loop
--source include/have_innodb.inc
--delimiter $$
LOOP
reproducible easily enough, both via MTR and the client (for the latter, --delimiter needs to be changed to DELIMITER), with a regular sigsegv-ed stack trace
10.11 6ebdd3013a18b01dbecec76b870810329eb76586 + patch above
#3 <signal handler called>
#4 0x000056377b853fa2 in lock_sys_t::hash_table::resize (this=0x56378de2a7a0 <lock_sys+96>, n=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:123
#5 0x000056377b85788c in lock_sys_t::resize (this=0x56378de2a740 <lock_sys>, n_cells=40960) at /data/src/10.11-bug/storage/innobase/lock/lock0lock.cc:463
#6 0x000056377bf54a7a in buf_pool_t::resize (this=0x56378aebbb40 <buf_pool>) at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1813
#7 0x000056377bf23c37 in buf_resize_callback () at /data/src/10.11-bug/storage/innobase/buf/buf0buf.cc:1860
#8 0x000056377c2fc998 in tpool::task_group::execute (this=0x56378de612a0 <single_threaded_group>, t=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task_group.cc:70
#9 0x000056377c2fdd97 in tpool::task::execute (this=0x56378de61360 <buf_resize_task>) at /data/src/10.11-bug/tpool/task.cc:32
#10 0x000056377c2dceb9 in tpool::thread_pool_generic::worker_main (this=0x618000000880, thread_var=0x630000020400) at /data/src/10.11-bug/tpool/tpool_generic.cc:580
#11 0x000056377c2fbc94 in std::__invoke_impl<void, void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__f=@0x604000000d28: (void (tpool::thread_pool_generic::*)(tpool::thread_pool_generic * const, tpool::worker_data *)) 0x56377c2dc8d4 <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>, __t=@0x604000000d20: 0x618000000880) at /usr/include/c++/10/bits/invoke.h:73
#12 0x000056377c2fb642 in std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__fn=@0x604000000d28: (void (tpool::thread_pool_generic::*)(tpool::thread_pool_generic * const, tpool::worker_data *)) 0x56377c2dc8d4 <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>) at /usr/include/c++/10/bits/invoke.h:95
#13 0x000056377c2fb419 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::_M_invoke<0ul, 1ul, 2ul> (this=0x604000000d18) at /usr/include/c++/10/thread:264
#14 0x000056377c2fb29a in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::operator() (this=0x604000000d18) at /usr/include/c++/10/thread:271
#15 0x000056377c2fb262 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> > >::_M_run (this=0x604000000d10) at /usr/include/c++/10/thread:215
#16 0x00007f3129738ed0 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#17 0x00007f3129844ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
#18 0x00007f3128ac7aef in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95