With MDEV-16678, InnoDB background tasks (most notably, the purge of committed transaction history) can acquire metadata locks. This seems to explain recent failures of the test main.backup_stages in 10.5:
10.5 ce41a9075ad63e5388045943b80f69b7
CURRENT_TEST: main.backup_stages
mysqltest: At line 177: query 'ALTER TABLE t_permanent_innodb ADD COLUMN col3 INT' failed: 1205: Lock wait timeout exceeded; try restarting transaction
I finally repeated this with rr replay (MDEV-22179), cherry-picking a version of the 10.2-22179 branch to 10.5. I also had to make safe_process safer, to get a complete trace:
} else if (MDL_context *mdl_context= static_cast<MDL_context*>(
thd_mdl_context(thd))) {
+ os_thread_sleep(10000);
mdl_context->release_lock(mdl);
}
}
The error was reported here:
10.5 0d6d63e1505bbf9898d46fa641a7a0579a4ae460
Thread 3 hit Breakpoint 1, my_error (nr=1205, MyFlags=0) at /mariadb/10.5m/mysys/my_error.c:113
113 DBUG_ENTER("my_error");
#0 my_error (nr=1205, MyFlags=0) at /mariadb/10.5m/mysys/my_error.c:113
#1 0x000055788770c0ef in MDL_context::acquire_lock (this=0x7ff638000e68, mdl_request=0x7ff6cc0aa3a0, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2322
#2 0x000055788770d055 in MDL_context::upgrade_shared_lock (this=0x7ff638000e68, mdl_ticket=0x7ff638022be0, new_type=MDL_EXCLUSIVE, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2583
#3 0x0000557887690878 in mysql_inplace_alter_table (thd=0x7ff638000d48, table_list=0x7ff638011c28, table=0x7ff6341992f8, altered_table=0x7ff6cc0aa7e0, ha_alter_info=0x7ff6cc0abb30,
inplace_supported=HA_ALTER_INPLACE_INSTANT, target_mdl_request=<optimized out>, alter_ctx=0x7ff6cc0abbd0) at /mariadb/10.5m/sql/sql_table.cc:7797
is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.5m/sql/sql_parse.cc:7992
Thread 13 (a purge worker task) is executing the added sleep, waiting to release the MDL. Below is some more output. The output of thread apply all backtrace was too long for Jira.
10.5 0d6d63e1505bbf9898d46fa641a7a0579a4ae460
(rr) up
#1 0x000055788770c0ef in MDL_context::acquire_lock (this=0x7ff638000e68, mdl_request=0x7ff6cc0aa3a0, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2322
2322 my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
(rr) p *this
$2 = {m_wait = {m_LOCK_wait_status = {m_mutex = {global = pthread_mutex_t = {Type = Error check, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, mutex = pthread_mutex_t = {Type = Normal,
Status = Not acquired, Robust = No, Shared = No, Protocol = None}, file = 0x557888085b9d "/mariadb/10.5m/include/mysql/psi/mysql_thread.h", name = 0x5578880de49f "m_LOCK_wait_status", line = 1257,
m_cond = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME, Shared = No}, m_psi = 0x5578899528c0}, m_wait_status = MDL_wait::GRANTED}, m_tickets = {
{<I_P_List_null_counter> = {<No data fields>}, <I_P_List_no_push_back<MDL_ticket>> = {<No data fields>}, m_first = 0x7ff63801c520},
{<I_P_List_null_counter> = {<No data fields>}, <I_P_List_no_push_back<MDL_ticket>> = {<No data fields>}, m_first = 0x7ff638022be0},
m_prlock = {lock = pthread_mutex_t = {Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {
Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME, Shared = No}, active_readers = 0, writers_waiting_readers = 0, active_writer = 0 '\000', writer_thread = 0},
Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME,
Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME,
The test checks that there are no lock conflicts by setting lock timeout to 0 and your sleep in the background mdl lock defeats this purpose.
The simple fix is to change "set session lock_wait_timeout=0" to "set session lock_wait_timeout=1" and then we have to assume that no one of your mdl locks is hold for more than a second.
I can't repeat the issue ,so please try this patch:
— a/mysql-test/main/backup_stages.test
+++ b/mysql-test/main/backup_stages.test
@@ -119,7 +119,7 @@ INSERT INTO t_permanent_myisam SET col1 = 1;
--connect(backup,localhost,root,,)
--connect(con11,localhost,root,,)
SET AUTOCOMMIT = 0;
-set session lock_wait_timeout=0;
+set session lock_wait_timeout=1;
--connect(con12,localhost,root,,)
SET AUTOCOMMIT = 1;
@@ -244,7 +244,7 @@ DROP TABLE t_con1_innodb;
DROP TABLE t_con1_myisam;
--connection con12
--error ER_LOCK_WAIT_TIMEOUT
-SET STATEMENT lock_wait_timeout=0 FOR UPDATE t_permanent_innodb SET col1 = 9;
+SET STATEMENT lock_wait_timeout=1 FOR UPDATE t_permanent_innodb SET col1 = 9;
--connection backup
BACKUP STAGE END;
(You must of course update the result table too)
Michael Widenius
added a comment - The test checks that there are no lock conflicts by setting lock timeout to 0 and your sleep in the background mdl lock defeats this purpose.
The simple fix is to change "set session lock_wait_timeout=0" to "set session lock_wait_timeout=1" and then we have to assume that no one of your mdl locks is hold for more than a second.
I can't repeat the issue ,so please try this patch:
— a/mysql-test/main/backup_stages.test
+++ b/mysql-test/main/backup_stages.test
@@ -119,7 +119,7 @@ INSERT INTO t_permanent_myisam SET col1 = 1;
--connect(backup,localhost,root,,)
--connect(con11,localhost,root,,)
SET AUTOCOMMIT = 0;
-set session lock_wait_timeout=0;
+set session lock_wait_timeout=1;
--connect(con12,localhost,root,,)
SET AUTOCOMMIT = 1;
@@ -244,7 +244,7 @@ DROP TABLE t_con1_innodb;
DROP TABLE t_con1_myisam;
--connection con12
--error ER_LOCK_WAIT_TIMEOUT
-SET STATEMENT lock_wait_timeout=0 FOR UPDATE t_permanent_innodb SET col1 = 9;
+SET STATEMENT lock_wait_timeout=1 FOR UPDATE t_permanent_innodb SET col1 = 9;
--connection backup
BACKUP STAGE END;
(You must of course update the result table too)
I finally repeated this with rr replay (
MDEV-22179), cherry-picking a version of the 10.2-22179 branch to 10.5. I also had to make safe_process safer, to get a complete trace:diff --git a/mysql-test/lib/My/SafeProcess/safe_process.cc b/mysql-test/lib/My/SafeProcess/safe_process.cc
index 4d0d1e2a3a0..abc167a4300 100644
--- a/mysql-test/lib/My/SafeProcess/safe_process.cc
+++ b/mysql-test/lib/My/SafeProcess/safe_process.cc
@@ -144,7 +144,7 @@ static int kill_child(bool was_killed)
message("Killing child: %d", child_pid);
// Terminate whole process group
if (! was_killed)
- kill(-child_pid, SIGKILL);
+ kill(-child_pid, SIGQUIT);
pid_t ret_pid= waitpid(child_pid, &status, 0);
Furthermore, I improved the probability with the following patch:
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 00d11d1806e..50decfe7aac 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -356,6 +356,7 @@ dict_table_close(
if (!thd || !mdl) {
} else if (MDL_context *mdl_context= static_cast<MDL_context*>(
thd_mdl_context(thd))) {
+ os_thread_sleep(10000);
mdl_context->release_lock(mdl);
}
The error was reported here:
10.5 0d6d63e1505bbf9898d46fa641a7a0579a4ae460
Thread 3 hit Breakpoint 1, my_error (nr=1205, MyFlags=0) at /mariadb/10.5m/mysys/my_error.c:113
113 DBUG_ENTER("my_error");
#0 my_error (nr=1205, MyFlags=0) at /mariadb/10.5m/mysys/my_error.c:113
#1 0x000055788770c0ef in MDL_context::acquire_lock (this=0x7ff638000e68, mdl_request=0x7ff6cc0aa3a0, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2322
#2 0x000055788770d055 in MDL_context::upgrade_shared_lock (this=0x7ff638000e68, mdl_ticket=0x7ff638022be0, new_type=MDL_EXCLUSIVE, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2583
#3 0x0000557887690878 in mysql_inplace_alter_table (thd=0x7ff638000d48, table_list=0x7ff638011c28, table=0x7ff6341992f8, altered_table=0x7ff6cc0aa7e0, ha_alter_info=0x7ff6cc0abb30,
inplace_supported=HA_ALTER_INPLACE_INSTANT, target_mdl_request=<optimized out>, alter_ctx=0x7ff6cc0abbd0) at /mariadb/10.5m/sql/sql_table.cc:7797
#4 mysql_alter_table (thd=<optimized out>, new_db=<optimized out>, new_name=<optimized out>, create_info=<optimized out>, table_list=0x7ff638011c28, alter_info=<optimized out>, order_num=<optimized out>,
order=<optimized out>, ignore=<optimized out>, if_exists=<optimized out>) at /mariadb/10.5m/sql/sql_table.cc:10457
#5 0x0000557887707b6d in Sql_cmd_alter_table::execute (this=<optimized out>, thd=0x7ff638000d48) at /mariadb/10.5m/sql/sql_alter.cc:532
#6 0x00005578875d31da in mysql_execute_command (thd=0x7ff638000d48) at /mariadb/10.5m/sql/sql_parse.cc:5950
#7 0x00005578875ce39d in mysql_parse (thd=0x7ff638000d48, rawbuf=0x7ff638011af0 "ALTER TABLE t_permanent_innodb ADD COLUMN col2 INT", length=<optimized out>, parser_state=<optimized out>,
is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.5m/sql/sql_parse.cc:7992
Thread 13 (a purge worker task) is executing the added sleep, waiting to release the MDL. Below is some more output. The output of thread apply all backtrace was too long for Jira.
10.5 0d6d63e1505bbf9898d46fa641a7a0579a4ae460
(rr) up
#1 0x000055788770c0ef in MDL_context::acquire_lock (this=0x7ff638000e68, mdl_request=0x7ff6cc0aa3a0, lock_wait_timeout=0) at /mariadb/10.5m/sql/mdl.cc:2322
2322 my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
(rr) p *this
$2 = {m_wait = {m_LOCK_wait_status = {m_mutex = {global = pthread_mutex_t = {Type = Error check, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, mutex = pthread_mutex_t = {Type = Normal,
Status = Not acquired, Robust = No, Shared = No, Protocol = None}, file = 0x557888085b9d "/mariadb/10.5m/include/mysql/psi/mysql_thread.h", name = 0x5578880de49f "m_LOCK_wait_status", line = 1257,
count = 0, create_flags = 0, active_flags = 0, id = 266, thread = 0, locked_mutex = 0x7ff638007ae8, used_mutex = 0x7ff638007b58, prev = 0x0, next = 0x0}, m_psi = 0x7ff6e187c240}, m_COND_wait_status = {
m_cond = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME, Shared = No}, m_psi = 0x5578899528c0}, m_wait_status = MDL_wait::GRANTED}, m_tickets = {
{<I_P_List_null_counter> = {<No data fields>}, <I_P_List_no_push_back<MDL_ticket>> = {<No data fields>}, m_first = 0x7ff63801c520},
{<I_P_List_null_counter> = {<No data fields>}, <I_P_List_no_push_back<MDL_ticket>> = {<No data fields>}, m_first = 0x7ff638022be0},
{<I_P_List_null_counter> = {<No data fields>}, <I_P_List_no_push_back<MDL_ticket>> = {<No data fields>}, m_first = 0x0}}, m_owner = 0x7ff638000e18, m_needs_thr_lock_abort = false, m_LOCK_waiting_for = {
m_prlock = {lock = pthread_mutex_t = {Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {
Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME, Shared = No}, active_readers = 0, writers_waiting_readers = 0, active_writer = 0 '\000', writer_thread = 0},
m_psi = 0x7ff6dcd16540}, m_waiting_for = 0x0, m_pins = 0x557889d05ca8}
(rr) p *mdl_request
$3 = {type = MDL_EXCLUSIVE, duration = MDL_TRANSACTION, next_in_list = 0x5578876be101
<open_table_from_share(THD*, TABLE_SHARE*, st_mysql_const_lex_string const*, unsigned int, unsigned int, unsigned int, TABLE*, bool, List<String>*)+1985>, prev_in_list = 0x20753ff5b63ef300, ticket = 0x0,
key = {m_length = 25, m_db_name_length = 4, m_hash_value = 1289468714,
m_ptr = "\002test\000t_permanent_innodb\000\240%\210\020\000\000\000\340\f\000\070\366\177\000\000\374\227%\210xU\000\000\313\022\000\000\000\000\000\000\360\244\n\314\366\177\000\000\060\250\n\314\366\177\000\000\000\363>\266\000\000\000\000H\r\000\070\366\177\000\000\017v\f\210xU\000\000\321f\f\210xU\000\000\a\000\000\200\240\020\000\000\020\246\n\314\366\177\000\000\324|\v\210xU\000\000\031\006\000\000\000\000\000\000\360\244\n\314\366\177\000\000\b\370\376\207xU\000\000\000\363>\266\365?u '\000\000\000\000\000\000\000 \302\065\210xU\000\000\340\f\000\070\366\177\000\000\060\302\065\210xU\000\000=\000\000\000\000\000\000\000"...}, m_src_file = 0x5578880de2ce "/mariadb/10.5m/sql/mdl.cc", m_src_line = 2581}
(rr) thread 13
(rr) frame 11
(rr) p *mdl
$5 = {<MDL_wait_for_subgraph> = {_vptr$MDL_wait_for_subgraph = 0x557888818fe8 <vtable for MDL_ticket+16>}, next_in_context = 0x0, prev_in_context = 0x557889ce31e8, next_in_lock = 0x0,
prev_in_lock = 0x7ff638022bf8, m_type = MDL_SHARED, m_duration = MDL_EXPLICIT, m_ctx = 0x557889ce30e8, m_lock = 0x7ff6342015b8, m_psi = 0x557889d2b0c0}
(rr) p *mdl.m_lock
$6 = {key = {m_length = 25, m_db_name_length = 4, m_hash_value = 1289468714, m_ptr = "\002test\000t_permanent_innodb\000", '\245' <repeats 362 times>}, m_rwlock = {m_prlock = {lock = pthread_mutex_t = {
Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME,
Shared = No}, active_readers = 0, writers_waiting_readers = 0, active_writer = 0 '\000', writer_thread = 0}, m_psi = 0x7ff6dcd12a40}, m_granted = {
m_list = {<I_P_List_null_counter> = {<No data fields>}, <I_P_List_fast_push_back<MDL_ticket>> = {m_last = 0x7ff6a8000db8}, m_first = 0x7ff638022be0}, m_bitmap = 34}, m_waiting = {
m_list = {<I_P_List_null_counter> = {<No data fields>}, <I_P_List_fast_push_back<MDL_ticket>> = {m_last = 0x7ff6342017e0}, m_first = 0x0}, m_bitmap = 0}, m_hog_lock_count = 0,
m_strategy = 0x557888a0de98 <MDL_lock::m_object_lock_strategy>}
(rr) thread 13
[Switching to thread 13 (Thread 1437863.1438123)]
#0 0x0000000070000002 in ?? ()
(rr) frame 11
#11 0x0000557887e9b759 in dict_table_close (table=<optimized out>, dict_locked=false, try_drop=<optimized out>, thd=0x557889ce2fc8, mdl=0x7ff6a8000da0) at /mariadb/10.5m/storage/innobase/dict/dict0dict.cc:359
359 os_thread_sleep(10000);
(rr) p *mdl
$5 = {<MDL_wait_for_subgraph> = {_vptr$MDL_wait_for_subgraph = 0x557888818fe8 <vtable for MDL_ticket+16>}, next_in_context = 0x0, prev_in_context = 0x557889ce31e8, next_in_lock = 0x0,
prev_in_lock = 0x7ff638022bf8, m_type = MDL_SHARED, m_duration = MDL_EXPLICIT, m_ctx = 0x557889ce30e8, m_lock = 0x7ff6342015b8, m_psi = 0x557889d2b0c0}
(rr) p *mdl.m_lock
$6 = {key = {m_length = 25, m_db_name_length = 4, m_hash_value = 1289468714, m_ptr = "\002test\000t_permanent_innodb\000", '\245' <repeats 362 times>}, m_rwlock = {m_prlock = {lock = pthread_mutex_t = {
Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, no_active_readers = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME,
Shared = No}, active_readers = 0, writers_waiting_readers = 0, active_writer = 0 '\000', writer_thread = 0}, m_psi = 0x7ff6dcd12a40}, m_granted = {
m_list = {<I_P_List_null_counter> = {<No data fields>}, <I_P_List_fast_push_back<MDL_ticket>> = {m_last = 0x7ff6a8000db8}, m_first = 0x7ff638022be0}, m_bitmap = 34}, m_waiting = {
m_list = {<I_P_List_null_counter> = {<No data fields>}, <I_P_List_fast_push_back<MDL_ticket>> = {m_last = 0x7ff6342017e0}, m_first = 0x0}, m_bitmap = 0}, m_hog_lock_count = 0,
m_strategy = 0x557888a0de98 <MDL_lock::m_object_lock_strategy>}