[MDEV-23239] InnoDB: Failing assertion: ((list).init == UT_LIST_INITIALISED) with innodb_evict_tables_on_commit_debug Created: 2020-07-20  Updated: 2020-07-21  Resolved: 2020-07-21

Status: Closed
Project: MariaDB Server
Component/s: Debug, Storage Engine - InnoDB
Affects Version/s: 10.3, 10.4, 10.5
Fix Version/s: 10.3.24, 10.4.14, 10.5.5

Type: Bug Priority: Major
Reporter: Elena Stepanova Assignee: Marko Mäkelä
Resolution: Duplicate Votes: 0
Labels: rr-profile-analyzed

Issue Links:
Relates
relates to MDEV-15880 Server crash or ASAN heap-use-after-f... Closed
relates to MDEV-23240 Assertion `!table->is_temporary()' fa... Closed
relates to MDEV-23241 Assertion `lock->trx->dict_operation ... Closed

 Description   

There isn't much information about innodb_evict_tables_on_commit_debug. I found a note in MDEV-20810 which says that the variable may be unreliable in concurrent DML/DDL scenarios. Maybe this note extends on the scenario below as well – there are concurrent transactions in there, although no concurrent statements. If the variable isn't supposed to be used this way, please feel free to close as not a bug.

--source include/have_innodb.inc
--source include/have_log_bin.inc
 
SET @innodb_var.save= @@innodb_evict_tables_on_commit_debug;
 
--connect (con1,localhost,root,,test)
CREATE TABLE t_error_prone (f INT) ENGINE=InnoDB;
 
--connection default
INSERT INTO t_error_prone (f) VALUES (1);
 
--connection con1
START TRANSACTION;
UPDATE t_error_prone SET f = 2;
 
--connection default
SET innodb_lock_wait_timeout= 0;
START TRANSACTION;
--error ER_LOCK_WAIT_TIMEOUT
UPDATE t_error_prone SET f = NULL;
 
--connect (con2,localhost,root,,test)
FLUSH TABLES;
SET GLOBAL innodb_evict_tables_on_commit_debug= 1;
 
--connection con1
--error ER_NO_SUCH_TABLE
ALTER TABLE x FORCE;
 
--connection default
DROP TABLE t_error_prone;
 
# Cleanup
--disconnect con1
--disconnect con2
SET GLOBAL innodb_evict_tables_on_commit_debug= @innodb_var.save;

10.3 2cae58f8

2020-07-21 00:43:36 0x7fe194ae9700  InnoDB: Assertion failure in file /data/src/10.3/storage/innobase/include/ut0lst.h line 335
InnoDB: Failing assertion: ((list).init == UT_LIST_INITIALISED)
 
#5  0x00007fe19b79a42a in __GI_abort () at abort.c:89
#6  0x000055acc9f7dfe2 in ut_dbg_assertion_failed (expr=0x55acca616938 "((list).init == UT_LIST_INITIALISED)", file=0x55acca616900 "/data/src/10.3/storage/innobase/include/ut0lst.h", line=335) at /data/src/10.3/storage/innobase/ut/ut0dbg.cc:60
#7  0x000055acc9da34fa in ut_list_remove<ut_list_base<ib_lock_t, ut_list_node<ib_lock_t> lock_table_t::*>, TableLockGetNode> (list=..., node=..., get_node=...) at /data/src/10.3/storage/innobase/include/ut0lst.h:335
#8  0x000055acc9da27bc in ut_list_remove<ut_list_base<ib_lock_t, ut_list_node<ib_lock_t> lock_table_t::*>, TableLockGetNode> (list=..., elem=0x7fe195629d90, get_node=...) at /data/src/10.3/storage/innobase/include/ut0lst.h:373
#9  0x000055acc9d936df in lock_table_remove_low (lock=0x7fe195629d90) at /data/src/10.3/storage/innobase/lock/lock0lock.cc:3707
#10 0x000055acc9d94739 in lock_table_dequeue (in_lock=0x7fe195629d90) at /data/src/10.3/storage/innobase/lock/lock0lock.cc:4015
#11 0x000055acc9d956a1 in lock_release (trx=0x7fe1956291c8) at /data/src/10.3/storage/innobase/lock/lock0lock.cc:4310
#12 0x000055acc9f71b8a in trx_t::release_locks (this=0x7fe1956291c8) at /data/src/10.3/storage/innobase/trx/trx0trx.cc:505
#13 0x000055acc9f6d1cd in trx_commit_in_memory (trx=0x7fe1956291c8, mtr=0x0) at /data/src/10.3/storage/innobase/trx/trx0trx.cc:1384
#14 0x000055acc9f6e82a in trx_commit_low (trx=0x7fe1956291c8, mtr=0x0) at /data/src/10.3/storage/innobase/trx/trx0trx.cc:1591
#15 0x000055acc9f6e8e2 in trx_commit (trx=0x7fe1956291c8) at /data/src/10.3/storage/innobase/trx/trx0trx.cc:1615
#16 0x000055acc9f6ed91 in trx_commit_for_mysql (trx=0x7fe1956291c8) at /data/src/10.3/storage/innobase/trx/trx0trx.cc:1751
#17 0x000055acc9ceac02 in innobase_commit_low (trx=0x7fe1956291c8) at /data/src/10.3/storage/innobase/handler/ha_innodb.cc:4355
#18 0x000055acc9ceafe6 in innobase_commit_ordered_2 (trx=0x7fe1956291c8, thd=0x7fe140000af0) at /data/src/10.3/storage/innobase/handler/ha_innodb.cc:4476
#19 0x000055acc9ceb617 in innobase_commit (hton=0x55accc6f9280, thd=0x7fe140000af0, commit_trx=true) at /data/src/10.3/storage/innobase/handler/ha_innodb.cc:4592
#20 0x000055acc9ac46ee in commit_one_phase_2 (thd=0x7fe140000af0, all=true, trans=0x7fe140004008, is_real_trans=true) at /data/src/10.3/sql/handler.cc:1642
#21 0x000055acc9ac45ea in ha_commit_one_phase (thd=0x7fe140000af0, all=true) at /data/src/10.3/sql/handler.cc:1622
#22 0x000055acc9ac3c9e in ha_commit_trans (thd=0x7fe140000af0, all=true) at /data/src/10.3/sql/handler.cc:1484
#23 0x000055acc992d57d in trans_commit_implicit (thd=0x7fe140000af0) at /data/src/10.3/sql/transaction.cc:361
#24 0x000055acc97a5cde in mysql_execute_command (thd=0x7fe140000af0) at /data/src/10.3/sql/sql_parse.cc:3660
#25 0x000055acc97b3ebd in mysql_parse (thd=0x7fe140000af0, rawbuf=0x7fe140012918 "DROP TABLE t_error_prone", length=24, parser_state=0x7fe194ae85e0, is_com_multi=false, is_next_command=false) at /data/src/10.3/sql/sql_parse.cc:7810
#26 0x000055acc97a0704 in dispatch_command (command=COM_QUERY, thd=0x7fe140000af0, packet=0x7fe140165a31 "DROP TABLE t_error_prone", packet_length=24, is_com_multi=false, is_next_command=false) at /data/src/10.3/sql/sql_parse.cc:1848
#27 0x000055acc979f01c in do_command (thd=0x7fe140000af0) at /data/src/10.3/sql/sql_parse.cc:1393
#28 0x000055acc99185f1 in do_handle_one_connection (connect=0x55accca3c5b0) at /data/src/10.3/sql/sql_connect.cc:1403
#29 0x000055acc9918353 in handle_one_connection (arg=0x55accca3c5b0) at /data/src/10.3/sql/sql_connect.cc:1308
#30 0x000055acca2cf7cc in pfs_spawn_thread (arg=0x55accc988790) at /data/src/10.3/storage/perfschema/pfs.cc:1869
#31 0x00007fe19d71a4a4 in start_thread (arg=0x7fe194ae9700) at pthread_create.c:456
#32 0x00007fe19b84ed0f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

Reproducible on 10.3 and 10.4.
Couldn't reproduce with the provided test case on 10.5, but the failure is definitely possible on 10.5, I've been getting it in concurrent tests.



 Comments   
Comment by Marko Mäkelä [ 2020-07-21 ]

I was unable to repeat this on 10.3 acc58fd83584c49049951a2c54c7f82a0c7ec412 (one merge commit ahead of the stated revision) with either of the following commands:

while ./mtr --parallel=auto --rr --rr-arg=-h testname{,,,,,,,,,,,,,}; do :; done
./mtr --parallel=auto --repeat=1000 testname{,,,,,,,,,,,,,,,,,,,,}

The latter command executed the test 21,000 times.

Please provide an rr replay trace for analysis.

Comment by Elena Stepanova [ 2020-07-21 ]

Oddly, it turned out to be less deterministic than I thought. Why the effect were very reliably on my machine, on another one they were different – particularly, I didn't get the failure described in this report on 10.3, but got it on 10.5 instead. So, rr profile will be available from 10.5.

See also MDEV-15880 with the ASAN version of the failure(s).

Comment by Marko Mäkelä [ 2020-07-21 ]

For the trace that I analyzed, the problematic table lock was created and registered in table->locks.count. But, a little later, the reference count was decremented and the the table object was freed while references still existed, and finally, the assertion failed:

10.5 c89366866bca2df46b0592719a1f6b6dabf470cb

(rr) when
Current event: 131103
(rr) bt
#0  0x000056371eb6b986 in lock_table_create (table=0x7f7a3c01d348, 
    type_mode=1, trx=0x7f7a8e4fc3c0, c_lock=0x0)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/lock/lock0lock.cc:3495
#1  0x000056371eb6d14d in lock_table (flags=0, table=0x7f7a3c01d348, 
    mode=LOCK_IX, thr=0x7f7a48042de8)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/lock/lock0lock.cc:3904
#2  0x000056371ec45228 in row_ins_step (thr=0x7f7a48042de8)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/row/row0ins.cc:3719
#3  0x000056371ec69bc3 in row_insert_for_mysql (
    mysql_rec=0x7f7a480fa180 "\375\001", prebuilt=0x7f7a480426c8, 
    ins_mode=ROW_INS_NORMAL)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/row/row0mysql.cc:1459
#4  0x000056371eab73fd in ha_innobase::write_row (this=0x7f7a48041e70, 
    record=0x7f7a480fa180 "\375\001")
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/handler/ha_innodb.cc:7729
(rr) when
Current event: 134175
(rr) bt
#0  dict_mem_table_free (table=0x7f7a3c01d348)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/dict/dict0mem.cc:229
#1  0x000056371ee09dfe in dict_sys_t::remove (this=0x56371fcbb880 <dict_sys>, 
    table=0x7f7a3c01d348, lru=true, keep=false)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/dict/dict0dict.cc:2001
#2  0x000056371ed36027 in trx_update_mod_tables_timestamp (trx=0x7f7a8e4fb1f8)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/trx/trx0trx.cc:1285
#3  0x000056371ed3b23a in trx_t::commit_in_memory (this=0x7f7a8e4fb1f8, 
    mtr=0x7f7a7c0ab6b0)
    at /home/mariadb/MDEV-23239/10.5/storage/innobase/trx/trx0trx.cc:1391
#18 0x000056371e4b3b94 in trans_commit_implicit (thd=0x7f7a3c000d78)
    at /home/mariadb/MDEV-23239/10.5/sql/transaction.cc:329
#19 0x000056371e2fafff in mysql_execute_command (thd=0x7f7a3c000d78)
    at /home/mariadb/MDEV-23239/10.5/sql/sql_parse.cc:3741
#20 0x000056371e3092c9 in mysql_parse (thd=0x7f7a3c000d78, 
    rawbuf=0x7f7a3c012a20 "ALTER TABLE x FORCE", length=19, 
    parser_state=0x7f7a7c0ad510, is_com_multi=false, is_next_command=false)
    at /home/mariadb/MDEV-23239/10.5/sql/sql_parse.cc:7993

The problem is that we completely fail to check for pending locks. Please try this patch. I only tested that it compiles on 10.3.

diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index e0d8d44a86e..1d4202b3033 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -1295,7 +1295,8 @@ trx_update_mod_tables_timestamp(
 		dict_table_t* table = it->first;
 		table->update_time = now;
 #ifdef UNIV_DEBUG
-		if (preserve_tables || table->get_ref_count()) {
+		if (preserve_tables || table->get_ref_count()
+		    || UT_LIST_GET_LEN(table->locks)) {
 			/* do not evict when committing DDL operations
 			or if some other transaction is holding the
 			table handle */
@@ -1304,7 +1305,11 @@ trx_update_mod_tables_timestamp(
 		/* recheck while holding the mutex that blocks
 		table->acquire() */
 		mutex_enter(&dict_sys_mutex);
-		if (!table->get_ref_count()) {
+		mutex_enter(&lock_sys.mutex);
+		const bool do_evict = !table->get_ref_count()
+			&& !UT_LIST_GET_LEN(table->locks);
+		mutex_exit(&lock_sys.mutex);
+		if (do_evict) {
 # if MYSQL_VERSION_ID >= 100405
 			dict_sys.remove(table, true);
 # else

Comment by Marko Mäkelä [ 2020-07-21 ]

MDEV-15880 addressed this.

Generated at Thu Feb 08 09:20:55 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.