#0 __pthread_kill_implementation (no_tid=0, signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:44
#1 __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
#2 __GI___pthread_kill (threadid=<optimized out>, signo=6) at ./nptl/pthread_kill.c:89
#3 0x00005608cebb9b95 in my_write_core (sig=6) at /home/jan/work/mariadb/10.6/mysys/stacktrace.c:424
#4 0x00005608ce27d292 in handle_fatal_signal (sig=6) at /home/jan/work/mariadb/10.6/sql/signal_handler.cc:357
#5 <signal handler called>
#6 __pthread_kill_implementation (no_tid=0, signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:44
#7 __pthread_kill_internal (signo=6, threadid=<optimized out>) at ./nptl/pthread_kill.c:78
#8 __GI___pthread_kill (threadid=<optimized out>, signo=signo@entry=6) at ./nptl/pthread_kill.c:89
#9 0x00007f2917e3bc46 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#10 0x00007f2917e227fc in __GI_abort () at ./stdlib/abort.c:79
#11 0x00007f2917e2271b in __assert_fail_base (fmt=0x7f2917fbac30 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5608cf0911b0 "!lock.was_chosen_as_deadlock_victim", file=0x5608cf090fb8 "/home/jan/work/mariadb/10.6/storage/innobase/include/trx0trx.h", line=1065, function=<optimized out>) at ./assert/assert.c:92
#12 0x00007f2917e33596 in __GI___assert_fail (assertion=0x5608cf0911b0 "!lock.was_chosen_as_deadlock_victim", file=0x5608cf090fb8 "/home/jan/work/mariadb/10.6/storage/innobase/include/trx0trx.h", line=1065, function=0x5608cf091160 "void trx_t::assert_freed() const") at ./assert/assert.c:101
#13 0x00005608ce963ed1 in trx_t::assert_freed (this=0x7f2901c61780) at /home/jan/work/mariadb/10.6/storage/innobase/include/trx0trx.h:1065
#14 0x00005608ce95fd3d in trx_t::commit_cleanup (this=0x7f2901c61780) at /home/jan/work/mariadb/10.6/storage/innobase/trx/trx0trx.cc:1385
#15 0x00005608ce9616bb in trx_t::commit (this=0x7f2901c61780) at /home/jan/work/mariadb/10.6/storage/innobase/trx/trx0trx.cc:1481
#16 0x00005608ce961a6d in trx_commit_for_mysql (trx=0x7f2901c61780) at /home/jan/work/mariadb/10.6/storage/innobase/trx/trx0trx.cc:1595
#17 0x00005608ce6c6e3d in innobase_commit_low (trx=0x7f2901c61780) at /home/jan/work/mariadb/10.6/storage/innobase/handler/ha_innodb.cc:4383
#18 0x00005608ce6c71ab in innobase_commit_ordered_2 (trx=0x7f2901c61780, thd=0x7f28b4000dc8) at /home/jan/work/mariadb/10.6/storage/innobase/handler/ha_innodb.cc:4489
#19 0x00005608ce6c777e in innobase_commit (hton=0x5608d11fe448, thd=0x7f28b4000dc8, commit_trx=false) at /home/jan/work/mariadb/10.6/storage/innobase/handler/ha_innodb.cc:4593
#20 0x00005608ce284832 in commit_one_phase_2 (thd=0x7f28b4000dc8, all=false, trans=0x7f28b4004738, is_real_trans=true) at /home/jan/work/mariadb/10.6/sql/handler.cc:2087
#21 0x00005608ce2845d3 in ha_commit_one_phase (thd=0x7f28b4000dc8, all=false) at /home/jan/work/mariadb/10.6/sql/handler.cc:2040
#22 0x00005608ce283663 in ha_commit_trans (thd=0x7f28b4000dc8, all=false) at /home/jan/work/mariadb/10.6/sql/handler.cc:1834
#23 0x00005608ce0a594d in trans_commit_stmt (thd=0x7f28b4000dc8) at /home/jan/work/mariadb/10.6/sql/transaction.cc:472
#24 0x00005608cdec5ce5 in mysql_execute_command (thd=0x7f28b4000dc8, is_called_from_prepared_stmt=false) at /home/jan/work/mariadb/10.6/sql/sql_parse.cc:6062
#25 0x00005608cdecba2c in mysql_parse (thd=0x7f28b4000dc8, rawbuf=0x7f28b4015650 "DELETE FROM `oltp17` WHERE `id` = 38146 /* QNO 3215 CON_ID 19 */", length=64, parser_state=0x7f2900b7a280) at /home/jan/work/mariadb/10.6/sql/sql_parse.cc:8021
#26 0x00005608cdecb102 in wsrep_mysql_parse (thd=0x7f28b4000dc8, rawbuf=0x7f28b4015650 "DELETE FROM `oltp17` WHERE `id` = 38146 /* QNO 3215 CON_ID 19 */", length=64, parser_state=0x7f2900b7a280) at /home/jan/work/mariadb/10.6/sql/sql_parse.cc:7833
#27 0x00005608cdeb741c in dispatch_command (command=COM_QUERY, thd=0x7f28b4000dc8, packet=0x7f28b403fd39 "DELETE FROM `oltp17` WHERE `id` = 38146 /* QNO 3215 CON_ID 19 */ ", packet_length=65, blocking=true) at /home/jan/work/mariadb/10.6/sql/sql_parse.cc:1883
#28 0x00005608cdeb5e66 in do_command (thd=0x7f28b4000dc8, blocking=true) at /home/jan/work/mariadb/10.6/sql/sql_parse.cc:1409
#29 0x00005608ce087ad4 in do_handle_one_connection (connect=0x5608d1817cf8, put_in_cache=true) at /home/jan/work/mariadb/10.6/sql/sql_connect.cc:1416
#30 0x00005608ce08783d in handle_one_connection (arg=0x5608d1833a28) at /home/jan/work/mariadb/10.6/sql/sql_connect.cc:1318
#31 0x00005608ce5e719a in pfs_spawn_thread (arg=0x5608d181c6a8) at /home/jan/work/mariadb/10.6/storage/perfschema/pfs.cc:2201
#32 0x00007f2917e90402 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#33 0x00007f2917f1f590 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
Steps to reproduce:
Start 3-node cluster with --wsrep-slave-threads > 2 --wsrep-debug=1
In wsrep_abort_transaction(), victim_trx->lock.set_wsrep_victim() is being invoked to set the flag even though there is no victim_trx->lock.wait_lock. The transaction has actually already been committed at that point. I think that it is wrong to modify anything in victim_trx at that point of execution. I believe that wsrep_abort_transaction() needs to acquire at exclusive lock_sys.latch (possibly also lock_sys.wait_mutex) as well as trx_t::mutex in order to avoid any race condition with a concurrent transaction commit or rollback. See lock_rec_reset_and_release_wait() and its callers.
Some noncritical part of the transaction commit is being blocked by DEBUG_SYNC:
(rr) thread apply 3 backtrace
Thread 3 (Thread 2754691.2756219):
…
#19 0x00005604abe87ac7 in debug_sync_set_action (thd=0x148d2c0074a8, action_str=0x148d3404dec0 "now SIGNAL sync.wsrep_trx_commit_reached WAIT_FOR signal.wsrep_trx_commit", len=73) at /test/mtest/10.6/sql/debug_sync.cc:1806
#20 0x00005604ac88eb5d in trx_t::commit (this=0x148d37262680) at /test/mtest/10.6/storage/innobase/trx/trx0trx.cc:1483
Marko Mäkelä
added a comment - In wsrep_abort_transaction() , victim_trx->lock.set_wsrep_victim() is being invoked to set the flag even though there is no victim_trx->lock.wait_lock . The transaction has actually already been committed at that point. I think that it is wrong to modify anything in victim_trx at that point of execution. I believe that wsrep_abort_transaction() needs to acquire at exclusive lock_sys.latch (possibly also lock_sys.wait_mutex ) as well as trx_t::mutex in order to avoid any race condition with a concurrent transaction commit or rollback. See lock_rec_reset_and_release_wait() and its callers.
(rr) p victim_trx.undo_no
$5 = 0
(rr) p victim_trx.state
$6 = {m = {static _S_min_alignment = 4, static _S_alignment = 4,
_M_i = TRX_STATE_COMMITTED_IN_MEMORY}}
Some noncritical part of the transaction commit is being blocked by DEBUG_SYNC :
(rr) thread apply 3 backtrace
Thread 3 (Thread 2754691.2756219):
…
#19 0x00005604abe87ac7 in debug_sync_set_action (thd=0x148d2c0074a8, action_str=0x148d3404dec0 "now SIGNAL sync.wsrep_trx_commit_reached WAIT_FOR signal.wsrep_trx_commit", len=73) at /test/mtest/10.6/sql/debug_sync.cc:1806
#20 0x00005604ac88eb5d in trx_t::commit (this=0x148d37262680) at /test/mtest/10.6/storage/innobase/trx/trx0trx.cc:1483
In wsrep_abort_transaction(), victim_trx->lock.set_wsrep_victim() is being invoked to set the flag even though there is no victim_trx->lock.wait_lock. The transaction has actually already been committed at that point. I think that it is wrong to modify anything in victim_trx at that point of execution. I believe that wsrep_abort_transaction() needs to acquire at exclusive lock_sys.latch (possibly also lock_sys.wait_mutex) as well as trx_t::mutex in order to avoid any race condition with a concurrent transaction commit or rollback. See lock_rec_reset_and_release_wait() and its callers.
(rr) p victim_trx.undo_no
$5 = 0
(rr) p victim_trx.state
$6 = {m = {static _S_min_alignment = 4, static _S_alignment = 4,
_M_i = TRX_STATE_COMMITTED_IN_MEMORY}}
Some noncritical part of the transaction commit is being blocked by DEBUG_SYNC:
(rr) thread apply 3 backtrace
Thread 3 (Thread 2754691.2756219):
…
#19 0x00005604abe87ac7 in debug_sync_set_action (thd=0x148d2c0074a8, action_str=0x148d3404dec0 "now SIGNAL sync.wsrep_trx_commit_reached WAIT_FOR signal.wsrep_trx_commit", len=73) at /test/mtest/10.6/sql/debug_sync.cc:1806
#20 0x00005604ac88eb5d in trx_t::commit (this=0x148d37262680) at /test/mtest/10.6/storage/innobase/trx/trx0trx.cc:1483