|
I think that this is related to the failure that I repeated on 10.4 and 10.2 and mysql-5.7.27 today, using the test main.other_lock_MDEV-20066 that is attached to MDEV-20066. On mysql-5.7.27 the stack trace looks like this:
|
mysql-5.7.27
|
Trying to get some variables.
|
Some pointers may be invalid and cause the dump to abort.
|
Query (7f9c88005af0): DELETE FROM t3 WHERE col1 = 7
|
…
|
#6 0x0000560866645111 in ut_dbg_assertion_failed (expr=0x560866c77bfb "!other_lock", file=0x560866c77318 "/mariadb/mysql-server/storage/innobase/lock/lock0lock.cc", line=1787) at /mariadb/mysql-server/storage/innobase/ut/ut0dbg.cc:67
|
#7 0x00005608664995a9 in lock_rec_add_to_queue (type_mode=1059, block=0x7f9cdceaba80, heap_no=2, index=0x7f9c90f916e0, trx=0x7f9cde8c9d20, caller_owns_trx_mutex=false) at /mariadb/mysql-server/storage/innobase/lock/lock0lock.cc:1787
|
#8 0x00005608664a37d7 in lock_rec_convert_impl_to_expl_for_trx (block=0x7f9cdceaba80, rec=0x7f9cdd3e807d "\200", index=0x7f9c90f916e0, offsets=0x7f9cdcbaab70, trx=0x7f9cde8c9d20, heap_no=2) at /mariadb/mysql-server/storage/innobase/lock/lock0lock.cc:6059
|
#9 0x00005608664a3ab6 in lock_rec_convert_impl_to_expl (block=0x7f9cdceaba80, rec=0x7f9cdd3e807d "\200", index=0x7f9c90f916e0, offsets=0x7f9cdcbaab70) at /mariadb/mysql-server/storage/innobase/lock/lock0lock.cc:6113
|
#10 0x00005608664a4929 in lock_clust_rec_read_check_and_lock (flags=0, block=0x7f9cdceaba80, rec=0x7f9cdd3e807d "\200", index=0x7f9c90f916e0, offsets=0x7f9cdcbaab70, mode=LOCK_X, gap_mode=1024, thr=0x7f9c880102f0) at /mariadb/mysql-server/storage/innobase/lock/lock0lock.cc:6404
|
#11 0x000056086659f42a in sel_set_rec_lock (pcur=0x7f9c8800fda0, rec=0x7f9cdd3e807d "\200", index=0x7f9c90f916e0, offsets=0x7f9cdcbaab70, mode=3, type=1024, thr=0x7f9c880102f0, mtr=0x7f9cdcbaaea0) at /mariadb/mysql-server/storage/innobase/row/row0sel.cc:1252
|
#12 0x00005608665a842c in row_search_mvcc (buf=0x7f9c8800f580 "\377", mode=PAGE_CUR_GE, prebuilt=0x7f9c8800fb80, match_mode=1, direction=0) at /mariadb/mysql-server/storage/innobase/row/row0sel.cc:5521
|
#13 0x00005608664180a4 in ha_innobase::index_read (this=0x7f9c8800f290, buf=0x7f9c8800f580 "\377", key_ptr=0x7f9c8801c6c0 "\a", key_len=4, find_flag=HA_READ_KEY_EXACT) at /mariadb/mysql-server/storage/innobase/handler/ha_innodb.cc:8744
|
#14 0x0000560865aa4e08 in handler::index_read_map (this=0x7f9c8800f290, buf=0x7f9c8800f580 "\377", key=0x7f9c8801c6c0 "\a", keypart_map=1, find_flag=HA_READ_KEY_EXACT) at /mariadb/mysql-server/sql/handler.h:2809
|
#15 0x0000560865a96ad8 in handler::ha_index_read_map (this=0x7f9c8800f290, buf=0x7f9c8800f580 "\377", key=0x7f9c8801c6c0 "\a", keypart_map=1, find_flag=HA_READ_KEY_EXACT) at /mariadb/mysql-server/sql/handler.cc:3039
|
#16 0x0000560865aa0777 in handler::read_range_first (this=0x7f9c8800f290, start_key=0x7f9c8800f378, end_key=0x7f9c8800f398, eq_range_arg=true, sorted=true) at /mariadb/mysql-server/sql/handler.cc:7404
|
#17 0x0000560865a9e7a1 in handler::multi_range_read_next (this=0x7f9c8800f290, range_info=0x7f9cdcbabf80) at /mariadb/mysql-server/sql/handler.cc:6469
|
#18 0x0000560865a9f5fa in DsMrr_impl::dsmrr_next (this=0x7f9c8800f4f0, range_info=0x7f9cdcbabf80) at /mariadb/mysql-server/sql/handler.cc:6861
|
#19 0x000056086642a174 in ha_innobase::multi_range_read_next (this=0x7f9c8800f290, range_info=0x7f9cdcbabf80) at /mariadb/mysql-server/storage/innobase/handler/ha_innodb.cc:20571
|
#20 0x0000560866271bd7 in QUICK_RANGE_SELECT::get_next (this=0x7f9c88011c60) at /mariadb/mysql-server/sql/opt_range.cc:11233
|
#21 0x0000560865fb95a2 in rr_quick (info=0x7f9cdcbac330) at /mariadb/mysql-server/sql/records.cc:398
|
#22 0x00005608662a53d3 in Sql_cmd_delete::mysql_delete (this=0x7f9c88007028, thd=0x7f9c88000dd0, limit=18446744073709551615) at /mariadb/mysql-server/sql/sql_delete.cc:463
|
#23 0x00005608662a823a in Sql_cmd_delete::execute (this=0x7f9c88007028, thd=0x7f9c88000dd0) at /mariadb/mysql-server/sql/sql_delete.cc:1393
|
#24 0x0000560866094b84 in mysql_execute_command (thd=0x7f9c88000dd0, first_level=true) at /mariadb/mysql-server/sql/sql_parse.cc:3606
|
#25 0x000056086609a3e5 in mysql_parse (thd=0x7f9c88000dd0, parser_state=0x7f9cdcbad660) at /mariadb/mysql-server/sql/sql_parse.cc:5570
|
#26 0x000056086608fc60 in dispatch_command (thd=0x7f9c88000dd0, com_data=0x7f9cdcbaddf0, command=COM_QUERY) at /mariadb/mysql-server/sql/sql_parse.cc:1484
|
#27 0x000056086608eb77 in do_command (thd=0x7f9c88000dd0) at /mariadb/mysql-server/sql/sql_parse.cc:1025
|
#28 0x00005608661bd74d in handle_connection (arg=0x56086a0c9df0) at /mariadb/mysql-server/sql/conn_handler/connection_handler_per_thread.cc:306
|
#29 0x000056086684a2c5 in pfs_spawn_thread (arg=0x56086a0c9e40) at /mariadb/mysql-server/storage/perfschema/pfs.cc:2190
|
#30 0x00007f9ce3956fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
|
#31 0x00007f9ce35104cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
|
current statement: INSERT INTO t3 (col1, col_text ) VALUES ( 7, 77 );
|
I was unable to repeat the issue on MariaDB Server 10.1.
I think that we must find and fix this bug before analyzing MDEV-20066 further, because that bug could be a consequence of this one.
|
|
Another failure, this time cmake -DWITH_WSREP=OFF:
|
10.2 9bf424bc7b8a6199ef0c22b508835f21019f861a
|
mysqld: /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:1903: void lock_rec_add_to_queue(ulint, const buf_block_t *, ulint, dict_index_t *, trx_t *, bool): Assertion `!other_lock' failed.
|
…
|
#7 0x00005633677e9fc1 in lock_rec_add_to_queue (type_mode=1059, block=0x7f510cf9b268, heap_no=2, index=0x7f50b818acc8, trx=0x7f510d89d268, caller_owns_trx_mutex=<optimized out>) at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:1903
|
#8 0x00005633677f644b in lock_rec_convert_impl_to_expl_for_trx (block=<optimized out>, rec=<optimized out>, index=<optimized out>, trx=<optimized out>, heap_no=2) at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:5800
|
#9 lock_rec_convert_impl_to_expl (block=0x7f510cf9b268, rec=<optimized out>, index=<optimized out>, offsets=<optimized out>) at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:5854
|
#10 0x00005633677f83c4 in lock_clust_rec_read_check_and_lock (flags=<optimized out>, block=0x7f510cf9b268, rec=0x7f510d5f007d "\200", index=0x7f50b818acc8, offsets=0x7f510cc1d050, mode=LOCK_X, gap_mode=1024, thr=0x7f50bc022ae8) at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:6130
|
#11 0x00005633678d6f2e in sel_set_rec_lock (pcur=0x7f50bc022268, rec=0x7f510d5f007d "\200", index=0x7f50b818acc8, offsets=<optimized out>, mode=<optimized out>, type=1024, thr=<optimized out>, mtr=<optimized out>) at /mariadb/10.2o/storage/innobase/row/row0sel.cc:1261
|
#12 0x00005633678d3ad1 in row_search_mvcc (buf=<optimized out>, mode=<optimized out>, prebuilt=0x7f50bc022098, match_mode=<optimized out>, direction=<optimized out>) at /mariadb/10.2o/storage/innobase/row/row0sel.cc:5133
|
#13 0x0000563367783291 in ha_innobase::index_read (this=0x7f50bc06fa50, buf=0x7f50bc06e748 "\377", key_ptr=<optimized out>, key_len=<optimized out>, find_flag=<optimized out>) at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:9470
|
#14 0x00005633675c2de4 in handler::ha_index_read_map (this=0x7f50bc06fa50, buf=0x7f50bc06e748 "\377", key=0x7f50b0174070 "\a", keypart_map=1, find_flag=HA_READ_KEY_EXACT) at /mariadb/10.2o/sql/handler.cc:2711
|
#15 0x00005633675c96b6 in handler::read_range_first (this=0x7f50bc06fa50, start_key=0x7f50bc06fb38, end_key=0x7f50bc06fb58, eq_range_arg=true, sorted=<optimized out>) at /mariadb/10.2o/sql/handler.cc:5455
|
#16 0x00005633675267c2 in handler::multi_range_read_next (this=0x7f50bc06fa50, range_info=0x7f510cc1ddf8) at /mariadb/10.2o/sql/multi_range_read.cc:298
|
#17 0x000056336752691c in Mrr_simple_index_reader::get_next (this=0x7f50bc06ffc0, range_info=0x7f510cc1ddf8) at /mariadb/10.2o/sql/multi_range_read.cc:335
|
#18 0x000056336752860f in DsMrr_impl::dsmrr_next (this=0x7f50bc06fe80, range_info=0x7f510cc1ddf8) at /mariadb/10.2o/sql/multi_range_read.cc:1411
|
#19 0x00005633676ee3f1 in QUICK_RANGE_SELECT::get_next (this=0x7f50b006e510) at /mariadb/10.2o/sql/opt_range.cc:11459
|
#20 0x00005633676fb05d in rr_quick (info=0x7f510cc1def0) at /mariadb/10.2o/sql/records.cc:365
|
#21 0x000056336771bb2c in mysql_delete (thd=0x7f50b0000cf8, table_list=<optimized out>, conds=<optimized out>, order_list=<optimized out>, limit=18446744073709551361, options=<optimized out>, result=<optimized out>) at /mariadb/10.2o/sql/sql_delete.cc:559
|
#22 0x0000563367408802 in mysql_execute_command (thd=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:4371
|
#23 0x0000563367400365 in mysql_parse (thd=0x7f50b0000cf8, rawbuf=0x7f50b0010640 "DELETE FROM t3 WHERE col1 = 7", length=<optimized out>, parser_state=0x7f510cc1f720, is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:7760
|
#24 0x00005633673fd508 in dispatch_command (command=COM_QUERY, thd=0x7f50b0000cf8, packet=0x7f50b0008329 "DELETE FROM t3 WHERE col1 = 7", packet_length=29, is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:1831
|
While we are converting the implicit lock (which conflicts with our pending lock request) to an explicit one, the owning transaction is no longer listed in trx_sys->rw_trx_list, because it is about to be committed in another thread after having inserted exactly 1 record:
#7 0x00005633677f9f8e in PolicyMutex<TTASEventMutex<GenericPolicy> >::enter (
|
this=<optimized out>, n_spins=30, n_delay=<optimized out>,
|
name=<optimized out>, line=6587)
|
at /mariadb/10.2o/storage/innobase/include/ib0mutex.h:637
|
#8 lock_trx_release_locks (trx=0x7f510d89d268)
|
at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:6587
|
#9 0x00005633679558f0 in trx_commit_in_memory (trx=<optimized out>,
|
mtr=<optimized out>, serialised=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/trx/trx0trx.cc:1665
|
#10 trx_commit_low (trx=<optimized out>, mtr=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/trx/trx0trx.cc:1883
|
#11 0x0000563367956bb3 in trx_commit (trx=0x7f510d89d268)
|
at /mariadb/10.2o/storage/innobase/trx/trx0trx.cc:1907
|
#12 0x0000563367958204 in trx_commit_for_mysql (trx=0x7f510d89d268)
|
at /mariadb/10.2o/storage/innobase/trx/trx0trx.cc:2116
|
#13 0x0000563367794a30 in innobase_commit_low (trx=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:4524
|
#14 innobase_commit_ordered_2 (trx=0x7f510d89d268, thd=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:4641
|
#15 0x000056336778cd76 in innobase_commit (hton=<optimized out>,
|
thd=0x7f50bc000cf8, commit_trx=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:4757
|
#16 0x00005633675c0545 in commit_one_phase_2 (thd=0x7f50bc000cf8,
|
all=<optimized out>, trans=0x7f50bc003fc8, is_real_trans=<optimized out>)
|
at /mariadb/10.2o/sql/handler.cc:1604
|
#17 0x00005633675c0470 in ha_commit_one_phase (thd=0x7f50bc000cf8,
|
all=<optimized out>) at /mariadb/10.2o/sql/handler.cc:1585
|
#18 0x00005633675c0081 in ha_commit_trans (thd=0x7f50bc000cf8,
|
all=<optimized out>) at /mariadb/10.2o/sql/handler.cc:1452
|
#19 0x0000563367508c5a in trans_commit_stmt (thd=0x7f50bc000cf8)
|
at /mariadb/10.2o/sql/transaction.cc:510
|
#20 0x0000563367406ffa in mysql_execute_command (thd=<optimized out>)
|
at /mariadb/10.2o/sql/sql_parse.cc:6043
|
#21 0x0000563367400365 in mysql_parse (thd=0x7f50bc000cf8,
|
rawbuf=0x7f50bc010640 "INSERT INTO t3 (col1, col_text ) VALUES ( 7, 77 )", length=<optimized out>, parser_state=0x7f510cc6a720,
|
is_com_multi=<optimized out>, is_next_command=<optimized out>)
|
at /mariadb/10.2o/sql/sql_parse.cc:7760
|
There is another transaction that is likewise being blocked, before having modified anything:
#7 0x00005633677f845e in PolicyMutex<TTASEventMutex<GenericPolicy> >::enter (
|
this=<optimized out>, n_spins=30, n_delay=6, name=<optimized out>,
|
line=6133) at /mariadb/10.2o/storage/innobase/include/ib0mutex.h:637
|
#8 lock_clust_rec_read_check_and_lock (flags=<optimized out>,
|
block=0x7f510cf9b268, rec=0x7f510d5f0070 "supremum",
|
index=0x7f50b818acc8, offsets=0x7f510cbd2570, mode=LOCK_X, gap_mode=0,
|
thr=0x7f50b017d678)
|
at /mariadb/10.2o/storage/innobase/lock/lock0lock.cc:6133
|
#9 0x00005633678d6f2e in sel_set_rec_lock (pcur=0x7f50b017d128,
|
rec=0x7f510d5f0070 "supremum", index=0x7f50b818acc8,
|
offsets=<optimized out>, mode=<optimized out>, type=0,
|
thr=<optimized out>, mtr=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/row/row0sel.cc:1261
|
#10 0x00005633678d2b52 in row_search_mvcc (buf=<optimized out>,
|
mode=<optimized out>, prebuilt=0x7f50b017cf58,
|
match_mode=<optimized out>, direction=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/row/row0sel.cc:4835
|
#11 0x00005633677836f0 in ha_innobase::general_fetch (this=0x7f50b017b880,
|
buf=0x7f50b0178258 "\377\a", direction=1, match_mode=0)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:9737
|
#12 0x00005633675c3431 in handler::ha_index_next (this=0x7f50b017b880,
|
buf=0x7f50b0178258 "\377\a") at /mariadb/10.2o/sql/handler.cc:2761
|
#13 0x00005633676fb2e6 in rr_index (info=0x7f510cbd3318)
|
at /mariadb/10.2o/sql/records.cc:449
|
#14 0x00005633674b6762 in mysql_update (thd=0x7f50b4000cf8,
|
table_list=<optimized out>, fields=..., values=...,
|
conds=<optimized out>, order_num=<optimized out>, order=<optimized out>,
|
limit=18446744073709551615, handle_duplicates=<optimized out>,
|
ignore=<optimized out>, found_return=<optimized out>,
|
updated_return=<optimized out>) at /mariadb/10.2o/sql/sql_update.cc:629
|
#15 0x0000563367404999 in mysql_execute_command (thd=<optimized out>)
|
at /mariadb/10.2o/sql/sql_parse.cc:4007
|
#16 0x0000563367400365 in mysql_parse (thd=0x7f50b4000cf8,
|
rawbuf=0x7f50b4010640 "UPDATE t3 SET col1 = 7", length=<optimized out>,
|
parser_state=0x7f510cbd4720, is_com_multi=<optimized out>,
|
is_next_command=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:7760
|
The table contains the following records:
(col1,DB_TRX_ID,DB_ROLL_PTR,col_text)=(4,0x6cf5,update,77),(7,0x6d17,insert,77)
(col_text,col1)=(77,4),(77,7), PAGE_MAX_TRX_ID=0x6d17
We have trx_sys->max_trx_id=0x6d1f, and both trx_sys->rw_trx_set and trx_sys->rw_trx_list contain only the two blocked transactions 0x6d1d and 0x6d1c. The committing transaction 0x6d17 is not listed. Because the most significant bit of DB_ROLL_PTR is set in the record, purge must have run after the UPDATE t3 SET col1=4 (necessarily by an earlier-committed transaction 0x6cf5) that had delete-marked the clustered index record, before the being-committed INSERT was executed.
I cannot spot any wrong-doing by transaction 0x6d17.
The problem appears to be a race condition between the commit of that transaction, and an active transaction that is waiting for a lock that is being implicitly held by the committing transaction. In lock_rec_convert_impl_to_expl(), at the time of the call to trx_rw_is_active(), the trx_sys->rw_trx_set must have contained the transaction 0x6d17. Because neither trx_sys->mutex nor lock_sys->mutex are being held, the transaction may be delisted. Later, in lock_rec_convert_impl_to_expl_for_trx(), we acquire lock_sys->mutex (which is blocking the commit of the conflicting transaction), and in lock_rec_add_to_queue() the assertion fails, because lock_rec_other_has_expl_req(LOCK_X, block, wait=false, heap_no, trx) finds an explicit lock.
Could it simply be that the other blocked transaction had converted the implicit lock to an explicit one? The answer appears to be no, because the code is blocked in lock_clust_rec_read_check_and_lock(), which was invoked on the supremum pseudo-record:
if (heap_no != PAGE_HEAP_NO_SUPREMUM) {
|
|
lock_rec_convert_impl_to_expl(block, rec, index, offsets);
|
}
|
|
lock_mutex_enter();
|
Unfortunately, no plausible value of other_lock was available in the core dump that I analyzed. It was other_lock=NULL, the only acceptable value for the assertion expression. In lock_sys->rec_hash->array, at the time of the crash, there is only one record lock bitmap entry, for LOCK_REC|LOCK_REC_NOT_GAP|LOCK_X of the committing transaction, and apparently an empty bitmap (no locks set).
On a rerun with improved instrumentation, we have a plausible value of other_lock, but the crash looks slightly different. We have transactions 0x31d5 and 0x31d6 in trx_sys->rw_trx_list and trx_sys->rw_trx_set. The assertion fails in DELETE (transaction 0x31d6). The lock_sys->rec_hash->array contains exactly one record lock bitmap, associated with transaction 0x31ce, an INSERT that is being committed. The transaction 0x31d5, which is associated with the other_lock that tripped the assertion, got a bit further than last time, and is waiting for fix_block->lock X-latch here:
#10 buf_page_get_gen (page_id=..., page_size=..., rw_latch=2, guess=0x0, mode=10, file=0x5585e6d67a50 "/mariadb/10.2o/storage/innobase/row/row0upd.cc", line=3097, mtr=0x7fd2bff666b8, err=0x7fd2bff6509c) at /mariadb/10.2o/storage/innobase/buf/buf0buf.cc:4918
|
#11 0x00005585e6928c4f in btr_cur_search_to_nth_level (index=<optimized out>, level=<optimized out>, tuple=<optimized out>, mode=<optimized out>, latch_mode=<optimized out>, cursor=<optimized out>, has_search_latch=0, file=0x5585e6d67a50 "/mariadb/10.2o/storage/innobase/row/row0upd.cc", line=3097, mtr=0x7fd2bff666b8, autoinc=0) at /mariadb/10.2o/storage/innobase/btr/btr0cur.cc:1242
|
#12 0x00005585e6948fd7 in btr_pcur_open_with_no_init_func (index=<optimized out>, tuple=0x7fd26801e1f8, mode=3398803468, latch_mode=0, cursor=<optimized out>, has_search_latch=0, file=<optimized out>, line=<optimized out>, mtr=<optimized out>) at /mariadb/10.2o/storage/innobase/include/btr0pcur.ic:525
|
#13 btr_pcur_restore_position_func (latch_mode=0, cursor=0x7fd268062ab0, file=0x5585e6d67a50 "/mariadb/10.2o/storage/innobase/row/row0upd.cc", line=<optimized out>, mtr=0x7fd2bff666b8) at /mariadb/10.2o/storage/innobase/btr/btr0pcur.cc:351
|
#14 0x00005585e688b2d2 in row_upd_clust_step (node=<optimized out>, thr=<optimized out>) at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3097
|
#15 row_upd (node=0x7fd25c080b90, thr=0x7fd25c080e98) at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3281
|
#16 0x00005585e688acf3 in row_upd_step (thr=0x7fd25c080e98) at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3427
|
#17 0x00005585e68392bb in row_update_for_mysql (prebuilt=0x7fd25c080088) at /mariadb/10.2o/storage/innobase/row/row0mysql.cc:1828
|
#18 0x00005585e6718f95 in ha_innobase::update_row (this=0x7fd25c07f900, old_row=0x7fd25c0296b8 "\377\004", new_row=0x7fd25c0296a8 "\377\a") at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:9016
|
#19 0x00005585e6562183 in handler::ha_update_row (this=0x7fd25c07f900, old_data=0x7fd25c0296b8 "\377\004", new_data=0x7fd25c0296a8 "\377\a") at /mariadb/10.2o/sql/handler.cc:6121
|
#20 0x00005585e644ded9 in mysql_update (thd=0x7fd268000cf8, table_list=<optimized out>, fields=..., values=..., conds=<optimized out>, order_num=<optimized out>, order=<optimized out>, limit=18446744073709551615, handle_duplicates=<optimized out>, ignore=<optimized out>, found_return=<optimized out>, updated_return=<optimized out>) at /mariadb/10.2o/sql/sql_update.cc:818
|
#21 0x00005585e639b999 in mysql_execute_command (thd=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:4007
|
#22 0x00005585e6397365 in mysql_parse (thd=0x7fd268000cf8, rawbuf=0x7fd268010640 "UPDATE t3 SET col1 = 7", length=<optimized out>, parser_state=0x7fd2bff68720, is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.2o/sql/sql_parse.cc:7760
|
The other_lock appears to be LOCK_X|LOCK_REC, granted to the UPDATE transaction (which by the way is still about to perform its first modification, blocked by the clustered index page latch that the DELETE is holding). The record lock bitmap is 0xe, meaning that heap numbers 1,2,3 (page supremum and 2 user records) are locked. In the table, we have
(col1,DB_TRX_ID,DB_ROLL_PTR,col_text)=(4,0x31cc,insert,77), (7,0x31ce,insert,77)
(col_text,col1)=(77,4), (77,7), PAGE_MAX_TRX_ID=0x31ce
In this case, purge had removed any delete-marked records from the clustered index, both before UPDATE t3 SET col1=4 (by committed transaction 0x31cc) and the INSERT (by committing transaction 0x31ce).
The scenario seems to be as follows:
- The INSERT transaction performed its operation, implicitly locking col1=7.
- The DELETE transaction, about to enqueue a waiting lock request on col1=7, starts to convert the implicit lock to explicit.
- The UPDATE transaction somehow gets in between, and gets the request on col1=7 granted.
- The assertion fails in the DELETE transaction.
- For the being-committed INSERT, blocked by lock_sys->mutex, there are no bits set in the only entry that is registered in lock_sys->rec_hash. This entry ought to have been created by an earlier transaction, possibly the currently active UPDATE transaction.
Open questions:
- Why is the other_lock no longer registered in lock_sys->rec_hash? It belongs to an active transaction that is under an impression that it successfully acquired the lock. UPDATE (until MDEV-16232 is implemented) always relies on explicit locks, never implicit ones.
- Were the lock bits for the lock_sys->rec_hash entry for the INSERT transaction ever set? Was that entry originally created by the UPDATE transaction, or was it by some earlier transaction?
- When was the lock request granted for the UPDATE transaction, and by which thread?
|
|
In my previous analysis, I forgot to dereference the singly-linked list of ib_lock_t::hash. Actually the other_lock appears to be linked from the entry in lock_sys->rec_hash, as the very last element, in the following case.
The following variation of the test appears to crash slightly less frequently:
CREATE TABLE t3 (col1 INT PRIMARY KEY, col_text INT UNSIGNED NOT NULL DEFAULT 0, INDEX uidx3(col_text)) ENGINE=InnoDB;
|
thread1: INSERT INTO t3 (col1, col_text ) VALUES ( 7, 77 );
|
thread2: DELETE FROM t3;
|
thread3: DELETE FROM t3;
|
Unsurprisingly, the crash occurs on DELETE FROM t3. The other_lock owner is waiting for the clustered index page latch here:
#13 btr_pcur_restore_position_func (latch_mode=0, cursor=0x7f5ae8179420,
|
file=0x55c7f4d9da70 "/mariadb/10.2o/storage/innobase/row/row0upd.cc",
|
line=<optimized out>, mtr=0x7f5b3fed0578)
|
at /mariadb/10.2o/storage/innobase/btr/btr0pcur.cc:351
|
#14 0x000055c7f48c1302 in row_upd_clust_step (node=<optimized out>,
|
thr=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3097
|
#15 row_upd (node=0x7f5ae404b8a0, thr=0x7f5ae404bba8)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3281
|
#16 0x000055c7f48c0d23 in row_upd_step (thr=0x7f5ae404bba8)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3427
|
#17 0x000055c7f486f2eb in row_update_for_mysql (prebuilt=0x7f5ae404b0c8)
|
at /mariadb/10.2o/storage/innobase/row/row0mysql.cc:1828
|
#18 0x000055c7f474f524 in ha_innobase::delete_row (this=0x7f5ae407dd60,
|
record=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:9136
|
4 first items in the only occupied hash bucket lock_sys->rec_hash belong to the being-committed INSERT transaction. The 5th and last item is the other_lock by the other DELETE. The first items are (all for the clustered index):
- LOCK_REC|LOCK_S|LOCK_REC_NOT_GAP({})
- LOCK_REC|LOCK_X({supremum})
- LOCK_REC|LOCK_X|LOCK_INSERT_INTENTION({supremum})
- LOCK_REC|LOCK_S|LOCK_GAP({col1=7})
The last item is other_lock, also for the clustered index:
- LOCK_REC|LOCK_X({col1=7})
This is clearly conflicting with the lock that the being-committed INSERT transaction is still holding! So, the assertion seems to be pretty much valid.
Also in this case, purge had run into completion before the INSERT.
The failure can be repeated even without involving a secondary index:
CREATE TABLE t3 (col1 INT PRIMARY KEY) ENGINE=InnoDB;
|
thread1: INSERT INTO t3 SET col1=7;
|
thread2: DELETE FROM t3;
|
thread3: DELETE FROM t3;
|
For the above test, I got a simpler failure where lock_sys->rec_hash only contains other_lock, which belongs to the other DELETE transaction that is blocked by the clustered index page latch that our DELETE is holding:
#10 buf_page_get_gen (page_id=..., page_size=..., rw_latch=2, guess=0x0,
|
mode=10,
|
file=0x55ca29432a70 "/mariadb/10.2o/storage/innobase/row/row0upd.cc",
|
line=3097, mtr=0x7fa572d59578, err=0x7fa572d57f5c)
|
at /mariadb/10.2o/storage/innobase/buf/buf0buf.cc:4918
|
#11 0x000055ca28ff3c7f in btr_cur_search_to_nth_level (index=<optimized out>,
|
level=<optimized out>, tuple=<optimized out>, mode=<optimized out>,
|
latch_mode=<optimized out>, cursor=<optimized out>, has_search_latch=0,
|
file=0x55ca29432a70 "/mariadb/10.2o/storage/innobase/row/row0upd.cc",
|
line=3097, mtr=0x7fa572d59578, autoinc=0)
|
at /mariadb/10.2o/storage/innobase/btr/btr0cur.cc:1242
|
#12 0x000055ca29014007 in btr_pcur_open_with_no_init_func (
|
index=<optimized out>, tuple=0x7fa514075b38, mode=2037727244,
|
latch_mode=0, cursor=<optimized out>, has_search_latch=0,
|
file=<optimized out>, line=<optimized out>, mtr=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/include/btr0pcur.ic:525
|
#13 btr_pcur_restore_position_func (latch_mode=0, cursor=0x7fa514017830,
|
file=0x55ca29432a70 "/mariadb/10.2o/storage/innobase/row/row0upd.cc",
|
line=<optimized out>, mtr=0x7fa572d59578)
|
at /mariadb/10.2o/storage/innobase/btr/btr0pcur.cc:351
|
#14 0x000055ca28f56302 in row_upd_clust_step (node=<optimized out>,
|
thr=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3097
|
#15 row_upd (node=0x7fa520078d18, thr=0x7fa520078ff0)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3281
|
#16 0x000055ca28f55d23 in row_upd_step (thr=0x7fa520078ff0)
|
at /mariadb/10.2o/storage/innobase/row/row0upd.cc:3427
|
#17 0x000055ca28f042eb in row_update_for_mysql (prebuilt=0x7fa520078268)
|
at /mariadb/10.2o/storage/innobase/row/row0mysql.cc:1828
|
#18 0x000055ca28de4524 in ha_innobase::delete_row (this=0x7fa52006ef20,
|
record=<optimized out>)
|
at /mariadb/10.2o/storage/innobase/handler/ha_innodb.cc:9136
|
Again, trx_sys->rw_trx_list only contains the two DELETE transactions (0x34bf, 0x34c0). The INSERT transaction 0x34be is being committed, waiting for the lock_sys->mutex. The table contents is (col1,DB_TRX_ID,DB_ROLL_PTR)=(7,0x34be,insert).
In this case, the scenario seems simpler, and looks like a possible subset of the more complex scenario presented earlier:
- The INSERT transaction performed its operation, implicitly locking col1=7.
- The current DELETE transaction started converting the implicit lock into explicit one.
- The INSERT transaction was removed from trx_sys->rw_trx_list and trx_sys->rw_trx_set in trx_erase_lists(), invoked by trx_commit_in_memory().
- The other DELETE transaction noticed that there is no implicit record lock, and registered a conflicting explicit record lock.
- The current DELETE transaction resumes execution and hits the assertion failure due to the conflicting other_lock.
I believe that it should also be possible to replace one of the DELETE with SELECT * FROM t3 FOR UPDATE, but I did not try that yet.
|
|
Here is an analysis of a 10.3 crash with the partial fix:
|
10.3 12cd8d54e373f5e8de98ed2cb5bc6921049a38d2
|
#5 0x00005647d21e00d0 in lock_rec_add_to_queue (type_mode=1059,
|
block=0x7fc32b1cf170, heap_no=2, index=0x7fc2d00f7608,
|
trx=0x7fc32ba8e288, caller_owns_trx_mutex=<optimized out>)
|
at /mariadb/10.3/storage/innobase/lock/lock0lock.cc:1853
|
#6 0x00005647d21f07f3 in lock_rec_convert_impl_to_expl_for_trx (
|
block=<optimized out>, rec=<optimized out>, index=<optimized out>,
|
trx=<optimized out>, heap_no=2)
|
at /mariadb/10.3/storage/innobase/lock/lock0lock.cc:5431
|
#7 lock_rec_convert_impl_to_expl (caller_trx=<optimized out>, block=0x7fc32b1cf170, rec=<optimized out>, index=<optimized out>, offsets=<optimized out>) at /mariadb/10.3/storage/innobase/lock/lock0lock.cc:5591
|
#8 0x00005647d21f2e00 in lock_clust_rec_read_check_and_lock (flags=0, block=0x7fc32b1cf170, rec=0x7fc32b51807d "\200", index=0x7fc2d00f7608, offsets=0x7fc3280dfec0, mode=LOCK_X, gap_mode=1024, thr=0x7fc2d4056548) at /mariadb/10.3/storage/innobase/lock/lock0lock.cc:5854
|
#9 0x00005647d232a1f9 in sel_set_rec_lock (pcur=0x7fc2d4055ce8, rec=<optimized out>, index=0x7fc2d00f7608, offsets=<optimized out>, mode=<optimized out>, type=<optimized out>, thr=0x7fc2d4056548, mtr=0x7fc3280e01e8) at /mariadb/10.3/storage/innobase/row/row0sel.cc:1260
|
#10 0x00005647d2326abb in row_search_mvcc (buf=<optimized out>, mode=<optimized out>, prebuilt=0x7fc2d4055b18, match_mode=<optimized out>, direction=0) at /mariadb/10.3/storage/innobase/row/row0sel.cc:5062
|
…
|
#22 0x00005647d1d3f53d in mysql_parse (thd=0x7fc2b8000cf8, rawbuf=0x7fc2b80109f0 "DELETE FROM t3 WHERE col1 = 7", length=<optimized out>, parser_state=<optimized out>, is_com_multi=<optimized out>, is_next_command=<optimized out>) at /mariadb/10.3/sql/sql_parse.cc:7829
|
The current transaction is 0x3d1b, and the other_lock->trx->id is 0x3d1c.
The clustered index root page contains (col1,DB_TRX_ID,DB_ROLL_PTR,col_text)=(4,0,1<<55,77),(7,0x3d19,insert,77).
The transaction 0x3d19 is waiting for lock_sys.mutex at the start of lock_trx_release_locks().
Because even with the partial fix, we will already have removed the transaction from trx_sys.rw_trx_hash at this point of commit, it is very possible that the other_lock->trx thread observed that the record is no longer implicitly locked. That thread is waiting for the clustered index leaf page latch, which our crashing thread is holding.
I developed a follow-up fix that allows us to remove the transaction from trx_sys.rw_trx_hash after the locks have been released. The MVCC read view creation still needs some work, because the test innodb.innodb-read-view is returning wrong result.
|