Details
-
Bug
-
Status: Closed (View Workflow)
-
Major
-
Resolution: Fixed
-
10.6, 10.11, 11.4, 11.7(EOL)
Description
11.7 d60efa269e798537efa77b2b915288425f6e73f1 |
(rr) bt
|
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
|
#1 0x000006cc48731859 in __GI_abort () at abort.c:79
|
#2 0x000006cc48731729 in __assert_fail_base (fmt=0x6cc488c7588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55dc39f6950a "0",
|
file=0x55dc39f68ee8 "/data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc", line=773, function=<optimized out>) at assert.c:92
|
#3 0x000006cc48742fd6 in __GI___assert_fail (assertion=0x55dc39f6950a "0", file=0x55dc39f68ee8 "/data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc", line=773,
|
function=0x55dc39f69510 "void row_purge_del_mark_error(const btr_cur_t&, const dtuple_t&)") at assert.c:101
|
#4 0x000055dc396da040 in row_purge_del_mark_error (cursor=..., entry=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:773
|
#5 0x000055dc396da2a8 in row_purge_remove_sec_if_poss_tree (node=0x55dc3d4eacf0, index=0x71c480bc780, entry=0x41e72c005d10, page_max_trx_id=1049)
|
at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:839
|
#6 0x000055dc396daa2d in row_purge_remove_sec_if_poss (node=0x55dc3d4eacf0, index=0x71c480bc780, entry=0x41e72c005d10)
|
at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:985
|
#7 0x000055dc396db69a in row_purge_upd_exist_or_extern_func (thr=0x55dc3d4e9d70, node=0x55dc3d4eacf0, undo_rec=0x7ffea87c07b5 "\a\364\f\033\027")
|
at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1148
|
#8 0x000055dc396dcd0e in row_purge_record_func (node=0x55dc3d4eacf0, undo_rec=0x7ffea87c07b5 "\a\364\f\033\027", thr=0x55dc3d4e9d70, updated_extern=false)
|
at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1539
|
#9 0x000055dc396dce68 in row_purge (node=0x55dc3d4eacf0, undo_rec=0x7ffea87c07b5 "\a\364\f\033\027", thr=0x55dc3d4e9d70)
|
at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1578
|
#10 0x000055dc396dcf41 in row_purge_step (thr=0x55dc3d4e9d70) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1641
|
#11 0x000055dc39650c82 in que_thr_step (thr=0x55dc3d4e9d70) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:554
|
#12 0x000055dc39650ef7 in que_run_threads_low (thr=0x55dc3d4e9d70) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:610
|
#13 0x000055dc3965100f in que_run_threads (thr=0x55dc3d4e9d70) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:630
|
#14 0x000055dc397254db in srv_task_execute () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1356
|
#15 0x000055dc3972593c in srv_purge_worker_task_low () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1487
|
#16 0x000055dc39725b2c in purge_worker_callback () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1498
|
#17 0x000055dc398f5a5c in tpool::task_group::execute (this=0x55dc3b167e60 <purge_task_group>, t=0x55dc3b167c60 <purge_worker_task>)
|
at /data/MDEV-35619/11.7/tpool/task_group.cc:73
|
#18 0x000055dc398f5e4a in tpool::task::execute (this=0x55dc3b167c60 <purge_worker_task>) at /data/MDEV-35619/11.7/tpool/task.cc:32
|
#19 0x000055dc398ef19d in tpool::thread_pool_generic::worker_main (this=0x55dc3d4351d0, thread_var=0x55dc3d435540) at /data/MDEV-35619/11.7/tpool/tpool_generic.cc:575
|
#20 0x000055dc398f5818 in std::__invoke_impl<void, void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__f=
|
@0x3558100014c8: (void (tpool::thread_pool_generic::*)(class tpool::thread_pool_generic * const, struct tpool::worker_data *)) 0x55dc398ef0ce <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>, __t=@0x3558100014c0: 0x55dc3d4351d0) at /usr/include/c++/9/bits/invoke.h:73
|
#21 0x000055dc398f56fa in std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> (__fn=
|
@0x3558100014c8: (void (tpool::thread_pool_generic::*)(class tpool::thread_pool_generic * const, struct tpool::worker_data *)) 0x55dc398ef0ce <tpool::thread_pool_generic::worker_main(tpool::worker_data*)>) at /usr/include/c++/9/bits/invoke.h:95
|
#22 0x000055dc398f5619 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::_M_invoke<0ul, 1ul, 2ul> (this=0x3558100014b8) at /usr/include/c++/9/thread:244
|
#23 0x000055dc398f55b4 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::operator() (this=0x3558100014b8) at /usr/include/c++/9/thread:251
|
#24 0x000055dc398f5594 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> > >::_M_run (this=0x3558100014b0) at /usr/include/c++/9/thread:195
|
#25 0x000035580d3eade4 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
|
#26 0x000000006ffe2609 in start_thread (arg=<optimized out>) at pthread_create.c:477
|
#27 0x000006cc4882e353 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
|
Attachments
Issue Links
- is caused by
-
MDEV-34515 Contention between secondary index UPDATE and purge due to large innodb_purge_batch_size
-
- Closed
-
-
MDEV-35508 Race condition between purge and secondary index INSERT or UPDATE
-
- Closed
-
Conccurrently with the purge operation, the committed delete-marked secondary index record is being delete-unmarked as part of an INSERT operation that in turn is part of an UPDATE operation:
11.7 d60efa269e798537efa77b2b915288425f6e73f1
Thread 21 hit Hardware watchpoint 1: -location $rec[-5]
Old value = 32 ' '
New value = 0 '\000'
mtr_t::write<1u, (mtr_t::write_type)1, unsigned long> (this=..., block=..., ptr=..., val=...) at /data/MDEV-35619/11.7/storage/innobase/include/mtr0log.h:214
214 memcpy_low(block, uint16_t(p - block.page.frame), p, end - p);
(rr) bt
#0 mtr_t::write<1u, (mtr_t::write_type)1, unsigned long> (this=..., block=..., ptr=..., val=...) at /data/MDEV-35619/11.7/storage/innobase/include/mtr0log.h:214
#1 0x000055dc397b115b in btr_cur_upd_rec_in_place (rec=..., index=..., offsets=..., update=..., block=..., mtr=...) at /data/MDEV-35619/11.7/storage/innobase/btr/btr0cur.cc:3006
#2 0x000055dc397b2ab2 in btr_cur_update_in_place (flags=..., cursor=..., offsets=..., update=..., cmpl_info=..., thr=..., trx_id=..., mtr=...) at /data/MDEV-35619/11.7/storage/innobase/btr/btr0cur.cc:3277
#3 0x000055dc397b4431 in btr_cur_optimistic_update (flags=..., cursor=..., offsets=..., heap=..., update=..., cmpl_info=..., thr=..., trx_id=..., mtr=...)
at /data/MDEV-35619/11.7/storage/innobase/btr/btr0cur.cc:3510
#4 0x000055dc3968d8a7 in row_ins_sec_index_entry_by_modify (flags=..., mode=..., cursor=..., offsets=..., offsets_heap=..., heap=..., entry=..., thr=..., mtr=...)
at /data/MDEV-35619/11.7/storage/innobase/row/row0ins.cc:231
#5 0x000055dc3969698b in row_ins_sec_index_entry_low (flags=..., mode=..., index=..., offsets_heap=..., heap=..., entry=..., trx_id=..., thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0ins.cc:3139
#6 0x000055dc3969724a in row_ins_sec_index_entry (index=..., entry=..., thr=..., check_foreign=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0ins.cc:3350
#7 0x000055dc3971567a in row_upd_sec_index_entry (node=..., thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0upd.cc:2009
#8 0x000055dc397158b4 in row_upd_sec_step (node=..., thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0upd.cc:2036
#9 0x000055dc39718455 in row_upd (node=..., thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0upd.cc:2783
#10 0x000055dc39718834 in row_upd_step (thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0upd.cc:2898
#11 0x000055dc396ba316 in row_update_for_mysql (prebuilt=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0mysql.cc:1658
#12 0x000055dc394fde79 in ha_innobase::update_row (this=..., old_row=..., new_row=...) at /data/MDEV-35619/11.7/storage/innobase/handler/ha_innodb.cc:8542
(rr) thread apply 2 backtrace
Thread 2 (Thread 1797209.1797215):
#0 btr_cur_t::search_leaf (this=..., tuple=..., mode=..., latch_mode=..., mtr=...) at /data/MDEV-35619/11.7/storage/innobase/btr/btr0cur.cc:1222
#1 0x000055dc395218cb in btr_pcur_open (tuple=..., mode=..., latch_mode=..., cursor=..., mtr=...) at /data/MDEV-35619/11.7/storage/innobase/include/btr0pcur.h:430
#2 0x000055dc396e657d in row_search_index_entry (entry=..., mode=..., pcur=..., mtr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0row.cc:1277
#3 0x000055dc396da1ea in row_purge_remove_sec_if_poss_tree (node=..., index=..., entry=..., page_max_trx_id=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:813
#4 0x000055dc396daa2d in row_purge_remove_sec_if_poss (node=..., index=..., entry=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:985
#5 0x000055dc396db69a in row_purge_upd_exist_or_extern_func (thr=..., node=..., undo_rec=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1148
#6 0x000055dc396dcd0e in row_purge_record_func (node=..., undo_rec=..., thr=..., updated_extern=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1539
#7 0x000055dc396dce68 in row_purge (node=..., undo_rec=..., thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1578
#8 0x000055dc396dcf41 in row_purge_step (thr=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1641
#9 0x000055dc39650c82 in que_thr_step (thr=...) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:554
#10 0x000055dc39650ef7 in que_run_threads_low (thr=...) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:610
#11 0x000055dc3965100f in que_run_threads (thr=...) at /data/MDEV-35619/11.7/storage/innobase/que/que0que.cc:630
#12 0x000055dc397254db in srv_task_execute () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1356
#13 0x000055dc3972593c in srv_purge_worker_task_low () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1487
#14 0x000055dc39725b2c in purge_worker_callback () at /data/MDEV-35619/11.7/storage/innobase/srv/srv0srv.cc:1498
w=..., new_row=...) at /data/MDEV-35619/11.7/storage/innobase/handler/ha_innodb.cc:8542
…
(rr) print $rec
$5 = (rec_t *) 0x7ffea874c1f8 "\200"
(rr) awatch *(char*)0x7ffea874c038@8
Hardware access (read/write) watchpoint 2: *(char*)0x7ffea874c038@8
(rr) continue
Continuing.
The rr replay trace seems to be of an unoptimized executable without -march=native, because my access watchpoint on PAGE_MAX_TRX_ID is being repeatedly hit for something that should be a single memory access, instead of lots of register spilling and several movzbl, shl and or instructions. Even on the 80486, the function mach_read_from_4() could be translated into a mov and bswap; on x86-64-v2 it could use movbe, in fact, also mach_read_from_8() would translate into a single movbe.
To avoid hitting numerous "false alarms" I ended up removing the original awatch and setting two separate watchpoints:
delete 2
awatch *(char*)0x7ffea874c038@8 thread 2
watch -l *(char*)0x7ffea874c038@8
reverse-continue
It turns out that the PAGE_MAX_TRX_ID did not change; it always was 0x419 even when it was read by the purge thread earlier:
(rr) reverse-continue
Continuing.
[Switching to Thread 1797209.1797215]
Thread 2 hit Hardware access (read/write) watchpoint 3: *(char*)0x7ffea874c038@8
Value = "\000\000\000\000\000\000\004\031"
0x000055dc39587703 in mach_read_from_4 (b=...) at /data/MDEV-35619/11.7/storage/innobase/include/mach0data.inl:187
187 | static_cast<uint32_t>(b[3])
(rr) backtrace
#0 0x000055dc39587703 in mach_read_from_4 (b=...) at /data/MDEV-35619/11.7/storage/innobase/include/mach0data.inl:187
#1 0x000055dc395a705b in mach_read_from_8 (b=...) at /data/MDEV-35619/11.7/storage/innobase/include/mach0data.inl:366
#2 0x000055dc395a7886 in page_get_max_trx_id (page=...) at /data/MDEV-35619/11.7/storage/innobase/include/page0page.h:407
#3 0x000055dc396da3a6 in row_purge_check (page=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:869
#4 0x000055dc396da90f in row_purge_remove_sec_if_poss_leaf (node=..., index=..., entry=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:953
#5 0x000055dc396daa03 in row_purge_remove_sec_if_poss (node=..., index=..., entry=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:983
#6 0x000055dc396db69a in row_purge_upd_exist_or_extern_func (thr=..., node=..., undo_rec=...) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:1148
The concurrent transaction carries that same transaction id 0x419. The following logic, which had been added in
MDEV-34515, seems to be at fault here:== page_get_max_trx_id(btr_cur_get_page(&pcur.btr_cur))
|| row_purge_poss_sec(node, index, entry, &mtr)) {
Even after the revision in
MDEV-35508, we page_max_trx_id carries the value 0x419, instead of the "impossible" value 1. We actually must recheck the purgeability if any transaction whose identifier is not larger than PAGE_MAX_TRX_ID is active. It turns out that there is an off-by-one error in theMDEV-35508fix:#0 trx_sys_t::find_same_or_older_in_purge (this=0x55dc3a6fb040 <trx_sys>, trx=0x6cc48903080, id=0x419) at /data/MDEV-35619/11.7/storage/innobase/include/trx0sys.h:1008
#1 0x000055dc396da43c in row_purge_check (page=0x7ffea874c000 "") at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:871
#2 0x000055dc396da90f in row_purge_remove_sec_if_poss_leaf (node=0x55dc3d4eacf0, index=0x71c480bc780, entry=0x41e72c005d10) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:953
#3 0x000055dc396daa03 in row_purge_remove_sec_if_poss (node=0x55dc3d4eacf0, index=0x71c480bc780, entry=0x41e72c005d10) at /data/MDEV-35619/11.7/storage/innobase/row/row0purge.cc:983
We hit the following code path:
Atomic_relaxed<trx_id_t> &max_inactive_id= trx->max_inactive_id_atomic;
I see that this logic actually differs from find_same_or_older(), which was introduced in
MDEV-28445to avoid a costly traversal of trx_sys.rw_trx_hash. I believe that the following should fix this:diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index f76bdea33a4..d889ece9730 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -983,18 +983,23 @@ class trx_sys_t
Our IA-32 target is not "i386" but at least "i686", that is, at least
Pentium MMX, which has a 64-bit data bus and 64-bit XMM registers. */
+ bool hot= false;
trx->mutex_lock();
trx_id_t &max_inactive_id= trx->max_inactive_id;
- const bool hot{max_inactive_id < id && find_same_or_older(trx, id)};
+ if (max_inactive_id >= id);
+ else if (!find_same_or_older(trx, id))
+ max_inactive_id= id;
+ else
+ hot= true;
#else
Atomic_relaxed<trx_id_t> &max_inactive_id= trx->max_inactive_id_atomic;
if (max_inactive_id >= id)
return false;
trx->mutex_lock();
- const bool hot{find_same_or_older(trx, id)};
-#endif
- if (hot)
+ const bool hot{find_same_or_older(trx, id)}
+ if (!hot)
max_inactive_id= id;
+#endif
trx->mutex_unlock();
return hot;
We also need to test the #if branch, which avoids an optimization that is applicable to 64-bit targets and the IA-32. The most convenient way to do that would be to change the condition to #if 1.