[MDEV-22901] Accessing btr_search_sys->hash_tables when buffer pool resize happens Created: 2020-06-15  Updated: 2020-08-11  Resolved: 2020-06-16

Status: Closed
Project: MariaDB Server
Component/s: Storage Engine - InnoDB
Affects Version/s: 10.4.13, 10.3.24
Fix Version/s: 10.3.24, 10.4.14

Type: Bug Priority: Major
Reporter: Matthias Leich Assignee: Thirunarayanan Balathandayuthapani
Resolution: Fixed Votes: 0
Labels: not-10.5, rr-profile

Attachments: File 002390.log    

 Description   

SEGV hit during RQG testing
 
==10485==ERROR: AddressSanitizer: SEGV on unknown address 0x000000000000 (pc 0x55754e5d221e bp 0x1cd2478b83c0 sp 0x1cd2478b83a0 T10)
==10485==The signal is caused by a READ memory access.
==10485==Hint: address points to the zero page.
    #0 0x55754e5d221d in btr_get_search_table storage/innobase/include/btr0sea.ic:188
    #1 0x55754e5daec2 in btr_search_build_page_hash_index storage/innobase/btr/btr0sea.cc:1533
    #2 0x55754e5dc1c8 in btr_search_move_or_delete_hash_entries(buf_block_t*, buf_block_t*) storage/innobase/btr/btr0sea.cc:1684
    #3 0x55754e2460dc in page_copy_rec_list_end(buf_block_t*, buf_block_t*, unsigned char*, dict_index_t*, mtr_t*) storage/innobase/page/page0page.cc:665
    #4 0x55754e55bbd3 in btr_compress(btr_cur_t*, unsigned long, mtr_t*) storage/innobase/btr/btr0btr.cc:3668
    #5 0x55754e5afd14 in btr_cur_compress_if_useful(btr_cur_t*, unsigned long, mtr_t*) storage/innobase/btr/btr0cur.cc:5401
    #6 0x55754e5b2b51 in btr_cur_pessimistic_delete(dberr_t*, unsigned long, btr_cur_t*, unsigned long, bool, mtr_t*) storage/innobase/btr/btr0cur.cc:5838
    #7 0x55754e3b10e1 in row_purge_remove_clust_if_poss_low storage/innobase/row/row0purge.cc:143
    #8 0x55754e3b1375 in row_purge_remove_clust_if_poss storage/innobase/row/row0purge.cc:192
    #9 0x55754e3b340f in row_purge_del_mark storage/innobase/row/row0purge.cc:647
    #10 0x55754e3b6ab3 in row_purge_record_func storage/innobase/row/row0purge.cc:1048
    #11 0x55754e3b7132 in row_purge storage/innobase/row/row0purge.cc:1109
    #12 0x55754e3b74ae in row_purge_step(que_thr_t*) storage/innobase/row/row0purge.cc:1158
    #13 0x55754e28f95a in que_thr_step storage/innobase/que/que0que.cc:947
    #14 0x55754e28fdc7 in que_run_threads_low storage/innobase/que/que0que.cc:1009
    #15 0x55754e29020a in que_run_threads(que_thr_t*) storage/innobase/que/que0que.cc:1049
    #16 0x55754e43fe5c in srv_task_execute storage/innobase/srv/srv0srv.cc:2059
    #17 0x55754e440c4d in purge_worker_callback storage/innobase/srv/srv0srv.cc:2224
    #18 0x55754e8917cb in tpool::task_group::execute(tpool::task*) tpool/task_group.cc:55
    #19 0x55754e892116 in tpool::task::execute() tpool/task.cc:47
    #20 0x55754e882639 in tpool::thread_pool_generic::worker_main(tpool::worker_data*) tpool/tpool_generic.cc:518
    #21 0x55754e88af91 in void std::__invoke_impl<void, void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*>(std::__invoke_memfun_deref, void (tpool::thread_pool_generic::*&&)(tpool::worker_data*), tpool::thread_pool_generic*&&, tpool::worker_data*&&) (/home/mleich/Server_bin/bb-10.5-release_asan/bin/mariadbd+0x35e1f91)
    #22 0x55754e888d98 in std::__invoke_result<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*>::type std::__invoke<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*>(void (tpool::thread_pool_generic::*&&)(tpool::worker_data*), tpool::thread_pool_generic*&&, tpool::worker_data*&&) /usr/include/c++/7/bits/invoke.h:95
    #23 0x55754e891102 in decltype (__invoke((_S_declval<0ul>)(), (_S_declval<1ul>)(), (_S_declval<2ul>)())) std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::_M_invoke<0ul, 1ul, 2ul>(std::_Index_tuple<0ul, 1ul, 2ul>) /usr/include/c++/7/thread:234
    #24 0x55754e891050 in std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> >::operator()() /usr/include/c++/7/thread:243
    #25 0x55754e890fb1 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (tpool::thread_pool_generic::*)(tpool::worker_data*), tpool::thread_pool_generic*, tpool::worker_data*> > >::_M_run() /usr/include/c++/7/thread:186
    #26 0x23c57ad976de  (/usr/lib/x86_64-linux-gnu/libstdc++.so.6+0xbd6de)
    #27 0x1203629286da in start_thread (/lib/x86_64-linux-gnu/libpthread.so.0+0x76da)
 
origin/bb-10.5-release 70a3e3ef552c0c65248151daaa45a2e978cfe86c 2020-06-13T19:32:33+03:00
 
RQG
git clone https://github.com/mleich1/rqg --branch experimental RQG
origin/experimental 611a31e38a3bc157392c0748c95991b6a248bb3b 2020-06-09T15:31:46+02:00
 
perl rqg.pl \
-mysqld=--loose-innodb_lock_schedule_algorithm=fcfs \
-gendata=conf/mariadb/table_stress.zz \
-gendata_sql=conf/mariadb/table_stress.sql \
-engine=Innodb \
-reporters=Deadlock1,ErrorLog,Backtrace \
-mysqld=--connect_timeout=60 \
-mysqld=--net_read_timeout=30 \
-mysqld=--net_write_timeout=60 \
-mysqld=--loose-idle_readonly_transaction_timeout=0 \
-mysqld=--loose-idle_transaction_timeout=0 \
-mysqld=--loose-idle_write_transaction_timeout=0 \
-mysqld=--interactive_timeout=28800 \
-mysqld=--lock_wait_timeout=86400 \
-mysqld=--innodb-lock-wait-timeout=50 \
-mysqld=--loose-table_lock_wait_timeout=50 \
-mysqld=--wait_timeout=28800 \
-mysqld=--slave_net_timeout=60 \
-mysqld=--loose-max-statement-time=30 \
-mysqld=--loose-debug_assert_on_not_freed_memory=0 \
-mysqld=--log-output=none \
-mysqld=--loose-innodb_fatal_semaphore_wait_threshold=300 \
-duration=300 \
-seed=random \
-grammar=conf/mariadb/table_stress_innodb.yy \
-threads=9 \
-mysqld=--innodb_stats_persistent=ON \
-mysqld=--innodb_adaptive_hash_index=ON \
-mysqld=--innodb_page_size=4K \
-mysqld=--innodb-buffer-pool-size=8M \
-duration=300 \
-no_mask \
... certain local settings ...



 Comments   
Comment by Thirunarayanan Balathandayuthapani [ 2020-06-16 ]

This issue could have been introduced while merging the patch for MDEV-22646. The problem exists in 10.3 and later.

diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index f24c143c24a..bc85e60aacc 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1485,14 +1485,13 @@ btr_search_build_page_hash_index(
 
        btr_search_check_free_space_in_heap(index);
 
-       hash_table_t*   table   = btr_get_search_table(index);
+       hash_table_t*   table;
        rw_lock_x_lock(ahi_latch);
 
        if (!btr_search_enabled) {
                goto exit_func;
        }
 
-       table = btr_get_search_table(index);
        if (block->index && ((block->curr_n_fields != n_fields)
                             || (block->curr_n_bytes != n_bytes)
                             || (block->curr_left_side != left_side))) {
@@ -1516,6 +1515,7 @@ btr_search_build_page_hash_index(
        block->curr_left_side = unsigned(left_side);
        block->index = index;
 
+       table = btr_get_search_table(index);
        for (i = 0; i < n_cached; i++) {
 
                ha_insert_for_fold(table, folds[i], block, recs[i]);

Comment by Marko Mäkelä [ 2020-06-16 ]

Thanks, good analysis. I suppose that this is related to MDEV-22456?

Can we also narrow down the scope of the variable, like this? Please test and push to 10.3.

diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index f24c143c24a..32f5ae672e8 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1348,7 +1348,6 @@ btr_search_build_page_hash_index(
 	ulint		n_recs;
 	ulint*		folds;
 	const rec_t**	recs;
-	ulint		i;
 	mem_heap_t*	heap		= NULL;
 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
 	rec_offs*	offsets		= offsets_;
@@ -1485,20 +1484,12 @@ btr_search_build_page_hash_index(
 
 	btr_search_check_free_space_in_heap(index);
 
-	hash_table_t*	table	= btr_get_search_table(index);
 	rw_lock_x_lock(ahi_latch);
 
 	if (!btr_search_enabled) {
 		goto exit_func;
 	}
 
-	table = btr_get_search_table(index);
-	if (block->index && ((block->curr_n_fields != n_fields)
-			     || (block->curr_n_bytes != n_bytes)
-			     || (block->curr_left_side != left_side))) {
-		goto exit_func;
-	}
-
 	/* This counter is decremented every time we drop page
 	hash index entries and is incremented here. Since we can
 	rebuild hash index for a page that is already hashed, we
@@ -1507,6 +1498,10 @@ btr_search_build_page_hash_index(
 	if (!block->index) {
 		assert_block_ahi_empty(block);
 		index->search_info->ref_count++;
+	} else if (block->curr_n_fields != n_fields
+		   || block->curr_n_bytes != n_bytes
+		   || block->curr_left_side != left_side) {
+		goto exit_func;
 	}
 
 	block->n_hash_helps = 0;
@@ -1516,9 +1511,11 @@ btr_search_build_page_hash_index(
 	block->curr_left_side = unsigned(left_side);
 	block->index = index;
 
-	for (i = 0; i < n_cached; i++) {
-
-		ha_insert_for_fold(table, folds[i], block, recs[i]);
+	{
+		hash_table_t*	table = btr_get_search_table(index);
+		for (ulint i = 0; i < n_cached; i++) {
+			ha_insert_for_fold(table, folds[i], block, recs[i]);
+		}
 	}
 
 	MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);

Comment by Marko Mäkelä [ 2020-06-16 ]

I see it now: MDEV-22456 need not be involved. btr_get_search_table() accesses btr_search_sys->hash_tables, which is not safe while the buffer pool is being resized concurrently.

Comment by Marko Mäkelä [ 2020-07-02 ]

This regression did not affect 10.5.4 because MDEV-22871 had refactored the code.

Generated at Thu Feb 08 09:18:20 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.