Details
- 
    
Bug
 - 
    Status: Closed (View Workflow)
 - 
    
Major
 - 
    Resolution: Fixed
 - 
    10.4.22
 - 
    None
 
Description
This morning: a crash in ull_get_key because $rax (return value from MDL_ticket::get_key) held a bad pointer.
This is the first time I've seen this particular crash. And this is on the first nodes we have running with 10.4.22.
Versions:
					galera-4 - 26.4.9-bionic
			 | 
		
					galera-arbitrator-4 - 26.4.9-bionic
			 | 
		
					libmariadb3:amd64 - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-backup - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-client-10.4 - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-client-core-10.4 - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-common - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-server - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-server-10.4 - 1:10.4.22+maria~bionic
			 | 
		
					mariadb-server-core-10.4 - 1:10.4.22+maria~bionic
			 | 
		
Short backtrace:
					(gdb) bt
			 | 
		
					#0  ull_get_key (ptr=<optimized out>, length=0x7fdfb66c5938, not_used=<optimized out>) at ./sql/item_func.cc:4002
			 | 
		
					#1  0x000055657d7b3413 in my_hash_key (first=1 '\001', length=0x7fdfb66c5938, record=<optimized out>, hash=0x7fde94002e58) at ./mysys/hash.c:196
			 | 
		
					#2  hashcmp (pos=0x7fde9431ea38, length=43, key=0x7fdfb66c59a8 "\bb28527f1559cc629795224a389d14a07ba41cb80", hash=0x7fde94002e58) at ./mysys/hash.c:371
			 | 
		
					#3  my_hash_first_from_hash_value (hash=hash@entry=0x7fde94002e58, hash_value=<optimized out>, key=key@entry=0x7fdfb66c59a8 "\bb28527f1559cc629795224a389d14a07ba41cb80", length=43, current_record=current_record@entry=0x7fdfb66c598c)
			 | 
		
					    at ./mysys/hash.c:288
			 | 
		
					#4  0x000055657d7b3538 in my_hash_first (hash=hash@entry=0x7fde94002e58, key=key@entry=0x7fdfb66c59a8 "\bb28527f1559cc629795224a389d14a07ba41cb80", length=<optimized out>, current_record=current_record@entry=0x7fdfb66c598c)
			 | 
		
					    at ./mysys/hash.c:262
			 | 
		
					#5  0x000055657d7b3551 in my_hash_search (hash=hash@entry=0x7fde94002e58, key=key@entry=0x7fdfb66c59a8 "\bb28527f1559cc629795224a389d14a07ba41cb80", length=<optimized out>) at ./mysys/hash.c:235
			 | 
		
					#6  0x000055657d2afc4f in Item_func_release_lock::val_int (this=0x7fde94010800) at ./sql/item_func.cc:4243
			 | 
		
Segfault due to invalid memory access:
					(gdb) disassemble 
			 | 
		
					Dump of assembler code for function ull_get_key(unsigned char const*, unsigned long*, char):
			 | 
		
					   0x000055657d2ab4e0 <+0>:	push   %rbp
			 | 
		
					   0x000055657d2ab4e1 <+1>:	mov    %rsp,%rbp
			 | 
		
					   0x000055657d2ab4e4 <+4>:	push   %rbx
			 | 
		
					   0x000055657d2ab4e5 <+5>:	mov    %rsi,%rbx
			 | 
		
					   0x000055657d2ab4e8 <+8>:	sub    $0x8,%rsp
			 | 
		
					   0x000055657d2ab4ec <+12>:	mov    (%rdi),%rdi
			 | 
		
					   0x000055657d2ab4ef <+15>:	callq  0x55657d122be0 <MDL_ticket::get_key() const>
			 | 
		
					=> 0x000055657d2ab4f4 <+20>:	movzwl (%rax),%edx
			 | 
		
					   0x000055657d2ab4f7 <+23>:	add    $0x8,%rax
			 | 
		
					(gdb) x $rax 
			 | 
		
					0x7fde00707063:	Cannot access memory at address 0x7fde00707063
			 | 
		
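(Note the low bytes of that address: 0x70 0x70 0x63 is ASCII "cpp" when read in memory order, i.e. the pointer has been overwritten with string data.)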
					/** Extract a hash key from User_level_lock. */
			 | 
		
					 | 
		
					uchar *ull_get_key(const uchar *ptr, size_t *length,
			 | 
		
					                   my_bool not_used __attribute__((unused)))
			 | 
		
					{
			 | 
		
					  User_level_lock *ull = (User_level_lock*) ptr;
			 | 
		
					  MDL_key *key = ull->lock->get_key();
			 | 
		
					  *length= key->length();
			 | 
		
					  return (uchar*) key->ptr();
			 | 
		
					}
			 | 
		
					class User_level_lock
			 | 
		
					{
			 | 
		
					public:
			 | 
		
					  MDL_ticket *lock;
			 | 
		
					  int refs;
			 | 
		
					};
			 | 
		
So, ull might be good or bad, but the MDL_ticket lock is pointing to something that is not a lock (anymore).
					(gdb) disassemble  0x55657d122be0
			 | 
		
					Dump of assembler code for function MDL_ticket::get_key() const:
			 | 
		
					   0x000055657d122be0 <+0>:	push   %rbp
			 | 
		
					   0x000055657d122be1 <+1>:	mov    0x38(%rdi),%rax
			 | 
		
					   0x000055657d122be5 <+5>:	mov    %rsp,%rbp
			 | 
		
					   0x000055657d122be8 <+8>:	pop    %rbp
			 | 
		
					   0x000055657d122be9 <+9>:	retq   
			 | 
		
					End of assembler dump.
			 | 
		
So, I would conclude that ull (ptr at entry) is 0x00007fde94311008 (0x8(%r10),%rdi in my_hash_first_from_hash_value) and ull->lock is then 0x00007fde9425bc10:
					(gdb) x/g $r10+8
			 | 
		
					0x7fde9431ea40:	0x00007fde94311008
			 | 
		
					(gdb) x 0x00007fde94311008
			 | 
		
					0x7fde94311008:	0x00007fde9425bc10
			 | 
		
Examining that memory:
					(gdb) print *(User_level_lock*)0x00007fde94311008
			 | 
		
					$9 = {lock = 0x7fde9425bc10, refs = 1}
			 | 
		
					 | 
		
					(gdb) print *((User_level_lock*)0x00007fde94311008)->lock
			 | 
		
					$11 = {<MDL_wait_for_subgraph> = {_vptr.MDL_wait_for_subgraph = 0x7fde9404dd40},
			 | 
		
					       next_in_context = 0x622f746f62646c69, prev_in_context = 0x2f746f62646c6975,
			 | 
		
					       next_in_lock = 0x61672f646c697562, prev_in_lock = 0x6372732f6172656c, 
			 | 
		
					       m_type = 1920169775, m_ctx = 0x2e72656469766f72, m_lock = 0x7fde00707063}
			 | 
		
That last one is definitely garbage:
					(gdb) print sizeof(MDL_ticket)
			 | 
		
					$12 = 64
			 | 
		
					 | 
		
					(gdb) x/8gx 0x7fde9425bc10
			 | 
		
					0x7fde9425bc10:	0x00007fde9404dd40	0x622f746f62646c69
			 | 
		
					0x7fde9425bc20:	0x2f746f62646c6975	0x61672f646c697562
			 | 
		
					0x7fde9425bc30:	0x6372732f6172656c	0x705f70657273772f
			 | 
		
					0x7fde9425bc40:	0x2e72656469766f72	0x00007fde00707063
			 | 
		
					 | 
		
					(gdb) print (char*)0x7fde9425bc10+8
			 | 
		
					$13 = 0x7fde9425bc18 "ildbot/buildbot/build/galera/src/wsrep_provider.cpp"
			 | 
		
So, when calling get_key on the lock 0x7fde9425bc10 we got:
					MDL_key *MDL_ticket::get_key() const
			 | 
		
					{
			 | 
		
					        return &m_lock->key;
			 | 
		
					}
			 | 
		
That is:
					(gdb) print &((MDL_ticket*)0x7fde9425bc10)->m_lock.key
			 | 
		
					$15 = (MDL_key *) 0x7fde00707063
			 | 
		
Assumptions:
- ull->refs looks okay, so ULL is possibly still good;
 - ull->lock (MDL_ticket) is received in Item_func_get_lock::val_int at one point;
 - now the MDL_ticket memory is overwritten.
 
Relevant SQL:
					SELECT RELEASE_LOCK('b28527f1559cc629795224a389d14a07ba41cb80')
			 | 
		
Full backtrace attached.
Is there anything I can get you? I have the core dump, so if you're interested in other threads, I can take a peek.
Cheers,
Walter Doekes
OSSO B.V.
Attachments
Issue Links
- relates to
 - 
                    
MDEV-17547 MariaDB 10.2 and Galera crashing with segfault
-         
 - Closed
 
 -         
 - 
                    
MDEV-24143 Galera nodes "randomly" crashing in Item_func_release_lock::val_int
-         
 - Closed
 
 -         
 - 
                    
MDEV-26803 Galera crash - Assertion. Possible parallel writeset problem
-         
 - Closed
 
 -