Details
-
Bug
-
Status: Closed (View Workflow)
-
Critical
-
Resolution: Fixed
-
10.6, 10.7(EOL), 10.8(EOL), 10.9(EOL)
Description
As noted in MDEV-27026 and MDEV-27983, the test innodb_fts.concurrent_insert would occasionally hang.
I was suspecting some foul play in the futex-based 10.6 latch implementation, but it turns out that the reason is a simple high-level bug: fts_optimize_thread is holding a shared dict_sys.latch while it is trying to acquire an exclusive latch. I found out about it by restoring the unnecessary exclusive dict_sys.latch acquisition to dict_index_set_merge_threshold() (the MDEV-27026 "fix"). The debug field dict_sys.latch_readers did indicate that a shared lock had been granted. To find out which thread it was granted to, I changed the dict_sys.latch to be of type index_lock, which will explicitly track the threads that are holding shared locks:
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
|
index f64fd6f04c9..5e050b27f6a 100644
|
--- a/storage/innobase/dict/dict0dict.cc
|
+++ b/storage/innobase/dict/dict0dict.cc
|
@@ -984,7 +984,7 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
|
if (latch_ex_wait_start.compare_exchange_strong
|
(old, now, std::memory_order_relaxed, std::memory_order_relaxed))
|
{
|
- latch.wr_lock(SRW_LOCK_ARGS(file, line));
|
+ latch.x_lock(SRW_LOCK_ARGS(file, line));
|
latch_ex_wait_start.store(0, std::memory_order_relaxed);
|
ut_ad(!latch_readers);
|
ut_ad(!latch_ex);
|
@@ -1003,7 +1003,7 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
|
if (waited > threshold / 4)
|
ib::warn() << "A long wait (" << waited
|
<< " seconds) was observed for dict_sys.latch";
|
- latch.wr_lock(SRW_LOCK_ARGS(file, line));
|
+ latch.x_lock(SRW_LOCK_ARGS(file, line));
|
ut_ad(!latch_readers);
|
ut_ad(!latch_ex);
|
ut_d(latch_ex= true);
|
@@ -4285,6 +4285,8 @@ dict_index_set_merge_threshold(
|
|
mtr.start();
|
|
+ dict_sys.lock(SRW_LOCK_CALL);
|
+
|
sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes);
|
|
/* Find the index row in SYS_INDEXES */
|
@@ -4320,6 +4322,8 @@ dict_index_set_merge_threshold(
|
|
mtr_commit(&mtr);
|
mem_heap_free(heap);
|
+
|
+ dict_sys.unlock();
|
}
|
|
#ifdef UNIV_DEBUG
|
@@ -4654,7 +4658,7 @@ void dict_sys_t::close()
|
temp_id_hash.free();
|
|
unlock();
|
- latch.destroy();
|
+ latch.free();
|
|
mysql_mutex_destroy(&dict_foreign_err_mutex);
|
|
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
|
index a02f4761964..1b9f23666ce 100644
|
--- a/storage/innobase/include/dict0dict.h
|
+++ b/storage/innobase/include/dict0dict.h
|
@@ -1354,7 +1354,7 @@ class dict_sys_t
|
std::atomic<ulonglong> latch_ex_wait_start;
|
|
/** the rw-latch protecting the data dictionary cache */
|
- MY_ALIGNED(CACHE_LINE_SIZE) srw_lock latch;
|
+ MY_ALIGNED(CACHE_LINE_SIZE) index_lock latch;
|
#ifdef UNIV_DEBUG
|
/** whether latch is being held in exclusive mode (by any thread) */
|
bool latch_ex;
|
@@ -1562,7 +1562,7 @@ class dict_sys_t
|
/** Exclusively lock the dictionary cache. */
|
void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
|
{
|
- if (latch.wr_lock_try())
|
+ if (latch.x_lock_try())
|
{
|
ut_ad(!latch_readers);
|
ut_ad(!latch_ex);
|
@@ -1570,7 +1570,7 @@ class dict_sys_t
|
}
|
else
|
lock_wait(SRW_LOCK_ARGS(file, line));
|
- }
|
+ }
|
|
#ifdef UNIV_PFS_RWLOCK
|
/** Unlock the data dictionary cache. */
|
@@ -1586,12 +1586,12 @@ class dict_sys_t
|
ut_ad(latch_ex);
|
ut_ad(!latch_readers);
|
ut_d(latch_ex= false);
|
- latch.wr_unlock();
|
+ latch.x_unlock();
|
}
|
/** Acquire a shared lock on the dictionary cache. */
|
void freeze()
|
{
|
- latch.rd_lock();
|
+ latch.s_lock();
|
ut_ad(!latch_ex);
|
ut_d(latch_readers++);
|
}
|
@@ -1600,7 +1600,7 @@ class dict_sys_t
|
{
|
ut_ad(!latch_ex);
|
ut_ad(latch_readers--);
|
- latch.rd_unlock();
|
+ latch.s_unlock();
|
}
|
#endif
|
|
With this patch and the test invocation (add repeat or parallel as suitable):
./mtr --mysqld=--innodb-fatal-semaphore-wait-threshold=3 --mysqld=--innodb-purge-threads=1 innodb_fts.concurrent_insert
|
I reproduced another failure: a recursive S-latch acquisition, reproduced with rr. Also that is forbidden by our design rules:
(rr) c
|
Continuing.
|
|
Thread 7 hit Hardware watchpoint 3: -location dict_sys.latch.readers_lock
|
|
Old value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 0}, <No data fields>}}
|
New value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 2147483649}, <No data fields>}}
|
0x0000556b4391b20c in srw_mutex_impl<false>::wr_lock (this=0x556b4451bc58 <dict_sys+88>) at /mariadb/10.6/storage/innobase/include/srw_lock.h:134
|
134 void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
|
1: *dict_sys.latch.readers._M_b._M_p = std::unordered_multiset with 0 elements
|
(rr)
|
Continuing.
|
|
Thread 7 hit Hardware watchpoint 3: -location dict_sys.latch.readers_lock
|
|
Old value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 2147483649}, <No data fields>}}
|
New value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 0}, <No data fields>}}
|
0x0000556b4391b2e9 in srw_mutex_impl<false>::wr_unlock (this=0x556b4451bc58 <dict_sys+88>) at /mariadb/10.6/storage/innobase/include/srw_lock.h:138
|
138 if (lk != HOLDER + 1)
|
1: *dict_sys.latch.readers._M_b._M_p = std::unordered_multiset with 1 element = {[0] = 84956419741248}
|
(rr) fin
|
Run till exit from #0 0x0000556b4391b2e9 in srw_mutex_impl<false>::wr_unlock (this=0x556b4451bc58 <dict_sys+88>) at /mariadb/10.6/storage/innobase/include/srw_lock.h:138
|
dict_sys_t::freeze (this=0x556b4451bc00 <dict_sys>) at /mariadb/10.6/storage/innobase/include/dict0dict.h:1595
|
1595 ut_ad(!latch_ex);
|
1: *dict_sys.latch.readers._M_b._M_p = std::unordered_multiset with 1 element = {[0] = 84956419741248}
|
(rr) fin
|
Run till exit from #0 dict_sys_t::freeze (this=0x556b4451bc00 <dict_sys>) at /mariadb/10.6/storage/innobase/include/dict0dict.h:1595
|
0x0000556b43a8bc0a in dict_acquire_mdl_shared<true> (table=0xfa740098ae0, thd=<optimized out>, mdl=mdl@entry=0x4d4475384a78, table_op=table_op@entry=DICT_TABLE_OP_NORMAL)
|
at /mariadb/10.6/storage/innobase/dict/dict0dict.cc:752
|
752 table= dict_sys.find_table(table_id);
|
1: *dict_sys.latch.readers._M_b._M_p = std::unordered_multiset with 1 element = {[0] = 84956419741248}
|
(rr) fin
|
Run till exit from #0 0x0000556b43a8bc0a in dict_acquire_mdl_shared<true> (table=0xfa740098ae0, thd=<optimized out>, mdl=mdl@entry=0x4d4475384a78, table_op=table_op@entry=DICT_TABLE_OP_NORMAL)
|
at /mariadb/10.6/storage/innobase/dict/dict0dict.cc:752
|
|
Thread 7 hit Hardware watchpoint 3: -location dict_sys.latch.readers_lock
|
|
Old value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 0}, <No data fields>}}
|
New value = {lock = {<std::__atomic_base<unsigned int>> = {_M_i = 2147483649}, <No data fields>}}
|
0x0000556b4391c1f8 in srw_mutex_impl<false>::wr_lock (this=0x556b4451bc58 <dict_sys+88>) at /mariadb/10.6/storage/innobase/include/srw_lock.h:134
|
134 void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
|
1: *dict_sys.latch.readers._M_b._M_p = std::unordered_multiset with 1 element = {[0] = 84956419741248}
|
(rr) bt
|
#0 0x0000556b4391c1f8 in srw_mutex_impl<false>::wr_lock (this=0x556b4451bc58 <dict_sys+88>) at /mariadb/10.6/storage/innobase/include/srw_lock.h:134
|
#1 sux_lock<ssux_lock_impl<false> >::have_s (this=0x556b4451bc40 <dict_sys+64>) at /mariadb/10.6/storage/innobase/include/sux_lock.h:177
|
#2 sux_lock<ssux_lock_impl<false> >::s_lock (this=0x556b4451bc40 <dict_sys+64>) at /mariadb/10.6/storage/innobase/include/sux_lock.h:362
|
#3 dict_sys_t::freeze (this=0x556b4451bc00 <dict_sys>) at /mariadb/10.6/storage/innobase/include/dict0dict.h:1594
|
#4 0x0000556b43a8af28 in dict_table_t::parse_name<false> (this=this@entry=0xfa740098ae0,
|
db_name=@0x4d44753845f0: '\000' <repeats 72 times>, "\227\251\000\000\000\000\000\000N\246", '\000' <repeats 110 times>,
|
tbl_name=@0x4d4475384790: '\000' <repeats 16 times>, "[\000\000\000n\000\000\000\002Q\265CkU\000\000\000\000\000\000\000\000\000\000\337\364\266CkU", '\000' <repeats 11 times>, "K\355\224(K\266\f\374\377\066\\\375\177\000\000\000\300\066\\\375\177\000\000\001", '\000' <repeats 15 times>, "\337=\300}\000\000\000\000\303y\227CkU\000\000\001\000\000\000\000\000\000\000\002Q\265CkU\000\000\000\000\000\000\000\000\000\000\310,\355[\375\177\000\000p,\355[\375\177\000\000\001\000\000\000\000\000\000\000\000\300\066\\\375\177\000\000\000\000\000\000\000\000\000\000 \002\342DkU\000\000A)\246CkU\000", <incomplete sequence \310>,
|
db_name_len=db_name_len@entry=0x4d4475384510, tbl_name_len=tbl_name_len@entry=0x4d4475384518) at /mariadb/10.6/storage/innobase/dict/dict0dict.cc:620
|
#5 0x0000556b43a8bccc in dict_acquire_mdl_shared<true> (table=0xfa740098ae0, thd=<optimized out>, mdl=mdl@entry=0x4d4475384a78, table_op=table_op@entry=DICT_TABLE_OP_NORMAL)
|
at /mariadb/10.6/storage/innobase/dict/dict0dict.cc:785
|
#6 0x0000556b43acabd8 in fts_optimize_sync_table (table=<optimized out>, process_message=<optimized out>) at /mariadb/10.6/storage/innobase/fts/fts0opt.cc:2791
|
#7 0x0000556b43ace084 in fts_optimize_callback () at /mariadb/10.6/storage/innobase/fts/fts0opt.cc:2912
|
This is simple API misuse and should be fixed by the following:
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
|
index f64fd6f04c9..aab17d808c2 100644
|
--- a/storage/innobase/dict/dict0dict.cc
|
+++ b/storage/innobase/dict/dict0dict.cc
|
@@ -782,7 +782,7 @@ dict_acquire_mdl_shared(dict_table_t *table,
|
|
size_t db1_len, tbl1_len;
|
|
- if (!table->parse_name<!trylock>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
|
+ if (!table->parse_name<true>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
|
{
|
/* The table was renamed to #sql prefix.
|
Release MDL (if any) for the old name and return. */ |
The template parameter no longer indicates that an exclusive dict_sys.latch is being held. It only indicates that a shared or exclusive dict_sys.latch is being held by the current thread. And that indeed is the case here.
With this fixed, the test no longer hangs for me.
This appears to be a regression due to MDEV-24258.
Attachments
Issue Links
- relates to
-
MDEV-27026 innodb_fts.concurrent_insert failed in bb, innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch
- Closed
-
MDEV-24258 Merge dict_sys.mutex into dict_sys.latch
- Closed
-
MDEV-27026 innodb_fts.concurrent_insert failed in bb, innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch
- Closed
-
MDEV-27983 InnoDB hangs on multiple concurrent requests of a cold ROW_FORMAT=COMPRESSED page
- Closed
dict_sys.latch.sux.patch hardens dict_sys.latch to catch invalid latch acquisition while holding a shared dict_sys.latch.