Details
-
Bug
-
Status: Closed (View Workflow)
-
Major
-
Resolution: Fixed
-
10.1.17, 5.5(EOL), 10.0(EOL), 10.1(EOL)
-
CentOS 6.8
-
5.5.54
Description
Our servers running MariaDB 10.1.17 in a galera cluster experienced a crash relating to the index merge sort_union optimization. The error caused all three of ou servers crash because the query, after failing on the first server due to the crash, was passed to the next server causing it to crash. I have disable the optimization using `optimizer_switch="index_merge_sort_union=off"` since the error continues to occur without it.
The crash occurs when the following query is run:
SELECT SQL_NO_CACHE contact_id FROM users_assignment FORCE INDEX (user_id,company_id) WHERE user_id = '57291' OR (user_id IN(55838,55839,56998,57260,57291,60227,121997,137571,173600,219233,306480,354991,358594,376261,398572,447223,472009,646282,932548,993011,1004609,1021262,1177117,1234288,1283121,1288749,1337771,57291) AND role_id = '1194') OR company_id = '2321' |
I have attached the relevant log excerpts from the servers as well as some output from running mysqld in gdb and a SQL file that creates a table with data against which running the sample query above crashes.
EXPLAIN EXTENDED of the query above:
+------+-------------+----------------------------+-------------+--------------------+--------------------+---------+------+------+----------+---------------------------------------------------+
|
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
|
+------+-------------+----------------------------+-------------+--------------------+--------------------+---------+------+------+----------+---------------------------------------------------+
|
| 1 | SIMPLE | users_assignment_brad_test | index_merge | user_id,company_id | user_id,company_id | 4,4 | NULL | 8193 | 100.00 | Using sort_union(user_id,company_id); Using where |
|
+------+-------------+----------------------------+-------------+--------------------+--------------------+---------+------+------+----------+---------------------------------------------------+
|
Attachments
Activity
The first config file I added (`my.cnf`) is from a remote asynchronous slave. The slave is stopped when I test this issue. The second config file (`my2.cnf`) is from one of our production servers from before I modified it to disable the `index_merge_sort_union` optimizer switch.
Thank you.
Another way to work around it might be removing sort_buffer_size=256K from the config file and relying on the default value instead.
Stack trace from 5.5 b3f7a8019d debug |
mysqld: /data/src/5.5/mysys/queues.c:265: queue_remove: Assertion `idx >= 1 && idx <= queue->elements' failed.
|
161004 1:07:11 [ERROR] mysqld got signal 6 ;
|
|
#7 0x00007f4520818312 in __GI___assert_fail (assertion=0xe793d0 "idx >= 1 && idx <= queue->elements", file=0xe79358 "/data/src/5.5/mysys/queues.c", line=265, function=0xe79400 <__PRETTY_FUNCTION__.10391> "queue_remove") at assert.c:101
|
#8 0x0000000000cae49d in queue_remove (queue=0x7f452284f230, idx=0) at /data/src/5.5/mysys/queues.c:265
|
#9 0x00000000007af331 in merge_buffers (param=0x7f452284f3b0, from_file=0x7f44f7c7f9c0, to_file=0x7f44f7d87260, sort_buffer=0x7f44f7d93060 "\303|w", lastbuff=0x7f44f7d272e0, Fb=0x7f44f7d272e0, Tb=0x7f44f7d27310, flag=1) at /data/src/5.5/sql/filesort.cc:1414
|
#10 0x00000000007afb21 in merge_index (param=0x7f452284f3b0, sort_buffer=0x7f44f7d93060 "\303|w", buffpek=0x7f44f7d272e0, maxbuffer=1, tempfile=0x7f44f7c7f9c0, outfile=0x7f44f7d87260) at /data/src/5.5/sql/filesort.cc:1599
|
#11 0x0000000000917046 in Unique::merge (this=0x7f44f7c7f998, table=0x7f44f7c22a60, buff=0x7f44f7d93060 "\303|w", without_last_merge=false) at /data/src/5.5/sql/uniques.cc:694
|
#12 0x000000000091729c in Unique::get (this=0x7f44f7c7f998, table=0x7f44f7c22a60) at /data/src/5.5/sql/uniques.cc:744
|
#13 0x00000000008c5545 in read_keys_and_merge_scans (thd=0x7f450d31f060, head=0x7f44f7c22a60, quick_selects=..., pk_quick_select=0x0, read_record=0x7f44f7cc6488, intersection=false, filtered_scans=0x0, unique_ptr=0x7f44f7cc6420) at /data/src/5.5/sql/opt_range.cc:10804
|
#14 0x00000000008c5721 in QUICK_INDEX_MERGE_SELECT::read_keys_and_merge (this=0x7f44f7cc63e0) at /data/src/5.5/sql/opt_range.cc:10827
|
#15 0x00000000008b24a8 in QUICK_INDEX_SORT_SELECT::reset (this=0x7f44f7cc63e0) at /data/src/5.5/sql/opt_range.cc:1943
|
#16 0x000000000065d4a6 in join_init_read_record (tab=0x7f44f7c7ef38) at /data/src/5.5/sql/sql_select.cc:17860
|
#17 0x000000000065b507 in sub_select (join=0x7f44f7c47ee8, join_tab=0x7f44f7c7ef38, end_of_records=false) at /data/src/5.5/sql/sql_select.cc:16965
|
#18 0x000000000065add5 in do_select (join=0x7f44f7c47ee8, fields=0x7f450d322b88, table=0x0, procedure=0x0) at /data/src/5.5/sql/sql_select.cc:16630
|
#19 0x0000000000639633 in JOIN::exec (this=0x7f44f7c47ee8) at /data/src/5.5/sql/sql_select.cc:2873
|
#20 0x0000000000639dfe in mysql_select (thd=0x7f450d31f060, rref_pointer_array=0x7f450d322ce0, tables=0x7f44f7c135d8, wild_num=0, fields=..., conds=0x7f44f7c479d0, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147748608, result=0x7f44f7c47ec8, unit=0x7f450d322390, select_lex=0x7f450d322a70) at /data/src/5.5/sql/sql_select.cc:3094
|
#21 0x00000000006306e4 in handle_select (thd=0x7f450d31f060, lex=0x7f450d3222e0, result=0x7f44f7c47ec8, setup_tables_done_option=0) at /data/src/5.5/sql/sql_select.cc:319
|
#22 0x0000000000609be7 in execute_sqlcom_select (thd=0x7f450d31f060, all_tables=0x7f44f7c135d8) at /data/src/5.5/sql/sql_parse.cc:4689
|
#23 0x0000000000602f38 in mysql_execute_command (thd=0x7f450d31f060) at /data/src/5.5/sql/sql_parse.cc:2234
|
#24 0x000000000060c7b2 in mysql_parse (thd=0x7f450d31f060, rawbuf=0x7f44f7c13078 "SELECT SQL_NO_CACHE contact_id FROM index_merge_crash_test FORCE INDEX (user_id,company_id) WHERE user_id = '57291' OR (user_id IN(55838,55839,56998,57260,57291,60227,121997,137571,173600,219233,30648"..., length=372, parser_state=0x7f45228506a0) at /data/src/5.5/sql/sql_parse.cc:5934
|
#25 0x00000000006004c7 in dispatch_command (command=COM_QUERY, thd=0x7f450d31f060, packet=0x7f450d7f6061 "", packet_length=372) at /data/src/5.5/sql/sql_parse.cc:1079
|
#26 0x00000000005ff681 in do_command (thd=0x7f450d31f060) at /data/src/5.5/sql/sql_parse.cc:793
|
#27 0x00000000007018ed in do_handle_one_connection (thd_arg=0x7f450d31f060) at /data/src/5.5/sql/sql_connect.cc:1270
|
#28 0x000000000070167a in handle_one_connection (arg=0x7f450d31f060) at /data/src/5.5/sql/sql_connect.cc:1186
|
#29 0x00007f45224ac0a4 in start_thread (arg=0x7f4522851700) at pthread_create.c:309
|
#30 0x00007f45208d287d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
|
Stack trace from 5.5.51 release |
#2 <signal handler called>
|
#3 Field_long::cmp (this=0x7fa64606cc98, a_ptr=0x14f <error: Cannot access memory at address 0x14f>, b_ptr=0x7fa646167004 "") at /home/buildbot/buildbot/build/sql/field.cc:3700
|
#4 0x000000000081211a in cmp_ref (ref2=0x7fa646167004 "", ref1=0x14f <error: Cannot access memory at address 0x14f>, this=<optimized out>) at /home/buildbot/buildbot/build/storage/xtradb/handler/ha_innodb.cc:11840
|
#5 ha_innobase::cmp_ref (this=<optimized out>, ref1=<optimized out>, ref2=<optimized out>) at /home/buildbot/buildbot/build/storage/xtradb/handler/ha_innodb.cc:11793
|
#6 0x0000000000ac4779 in _downheap (queue=0x7fa66dff9660, start_idx=0, element=0x7fa6460fc300 "") at /home/buildbot/buildbot/build/mysys/queues.c:299
|
#7 0x0000000000ac4936 in queue_remove (queue=<optimized out>, idx=<optimized out>) at /home/buildbot/buildbot/build/mysys/queues.c:267
|
#8 0x00000000006d3999 in merge_buffers (param=param@entry=0x7fa66dff9760, from_file=from_file@entry=0x7fa6460da7f0, to_file=to_file@entry=0x7fa6460c0500, sort_buffer=sort_buffer@entry=0x7fa64615f000 "\303|w", lastbuff=lastbuff@entry=0x7fa6460fc300, Fb=Fb@entry=0x7fa6460fc300, Tb=0x7fa6460fc330, flag=flag@entry=1) at /home/buildbot/buildbot/build/sql/filesort.cc:1414
|
#9 0x00000000006d466c in merge_index (param=param@entry=0x7fa66dff9760, sort_buffer=sort_buffer@entry=0x7fa64615f000 "\303|w", buffpek=buffpek@entry=0x7fa6460fc300, maxbuffer=<optimized out>, tempfile=tempfile@entry=0x7fa6460da7f0, outfile=outfile@entry=0x7fa6460c0500) at /home/buildbot/buildbot/build/sql/filesort.cc:1599
|
#10 0x00000000007ea7fc in Unique::merge (this=this@entry=0x7fa6460da7c8, table=table@entry=0x7fa64608d800, buff=buff@entry=0x7fa64615f000 "\303|w", without_last_merge=without_last_merge@entry=false) at /home/buildbot/buildbot/build/sql/uniques.cc:694
|
#11 0x00000000007eae8a in Unique::get (this=this@entry=0x7fa6460da7c8, table=table@entry=0x7fa64608d800) at /home/buildbot/buildbot/build/sql/uniques.cc:744
|
#12 0x00000000007a9c2c in read_keys_and_merge_scans (thd=0x7fa659f03000, head=0x7fa64608d800, quick_selects=..., pk_quick_select=0x0, read_record=0x7fa64607e3a8, intersection=intersection@entry=false, filtered_scans=filtered_scans@entry=0x0, unique_ptr=unique_ptr@entry=0x7fa64607e340) at /home/buildbot/buildbot/build/sql/opt_range.cc:10804
|
#13 0x00000000007a9de9 in QUICK_INDEX_MERGE_SELECT::read_keys_and_merge (this=0x7fa64607e300) at /home/buildbot/buildbot/build/sql/opt_range.cc:10827
|
#14 0x00000000005cf525 in join_init_read_record (tab=0x7fa6460d9e60) at /home/buildbot/buildbot/build/sql/sql_select.cc:17842
|
#15 0x00000000005c2d01 in sub_select (join=0x7fa646069e88, join_tab=0x7fa6460d9e60, end_of_records=<optimized out>) at /home/buildbot/buildbot/build/sql/sql_select.cc:16947
|
#16 0x00000000005d9edd in do_select (join=join@entry=0x7fa646069e88, fields=fields@entry=0x7fa659f06988, table=table@entry=0x0, procedure=0x0) at /home/buildbot/buildbot/build/sql/sql_select.cc:16612
|
#17 0x00000000005ef604 in JOIN::exec (this=this@entry=0x7fa646069e88) at /home/buildbot/buildbot/build/sql/sql_select.cc:2873
|
#18 0x00000000005ea925 in mysql_select (thd=thd@entry=0x7fa659f03000, rref_pointer_array=rref_pointer_array@entry=0x7fa659f06ae0, tables=0x7fa64601e578, wild_num=<optimized out>, fields=..., conds=<optimized out>, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147748608, result=result@entry=0x7fa646069e68, unit=0x7fa659f06190, select_lex=select_lex@entry=0x7fa659f06870) at /home/buildbot/buildbot/build/sql/sql_select.cc:3094
|
#19 0x00000000005f1533 in handle_select (thd=thd@entry=0x7fa659f03000, lex=lex@entry=0x7fa659f060e0, result=result@entry=0x7fa646069e68, setup_tables_done_option=setup_tables_done_option@entry=0) at /home/buildbot/buildbot/build/sql/sql_select.cc:319
|
#20 0x000000000059b5e8 in execute_sqlcom_select (thd=thd@entry=0x7fa659f03000, all_tables=0x7fa64601e578) at /home/buildbot/buildbot/build/sql/sql_parse.cc:4689
|
#21 0x00000000005a3373 in mysql_execute_command (thd=thd@entry=0x7fa659f03000) at /home/buildbot/buildbot/build/sql/sql_parse.cc:2234
|
#22 0x00000000005a6c64 in mysql_parse (thd=thd@entry=0x7fa659f03000, parser_state=parser_state@entry=0x7fa66dffb710, length=<optimized out>, rawbuf=<optimized out>) at /home/buildbot/buildbot/build/sql/sql_parse.cc:5934
|
#23 0x00000000005a9123 in dispatch_command (command=COM_QUERY, thd=0x7fa659f03000, packet=<optimized out>, packet_length=372) at /home/buildbot/buildbot/build/sql/sql_parse.cc:1245
|
#24 0x00000000005a929e in do_command (thd=<optimized out>) at /home/buildbot/buildbot/build/sql/sql_parse.cc:793
|
#25 0x000000000065bc3b in do_handle_one_connection (thd_arg=thd_arg@entry=0x7fa659f03000) at /home/buildbot/buildbot/build/sql/sql_connect.cc:1270
|
#26 0x000000000065bcbc in handle_one_connection (arg=0x7fa659f03000) at /home/buildbot/buildbot/build/sql/sql_connect.cc:1186
|
#27 0x00007fa66d6120a4 in start_thread (arg=0x7fa66dffc700) at pthread_create.c:309
|
#28 0x00007fa66cd3b87d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
|
Test data
The query is
SELECT SQL_NO_CACHE contact_id FROM index_merge_crash_test FORCE INDEX (user_id,company_id) WHERE user_id = '57291' OR (user_id IN(55838,55839,56998,57260,57291,60227,121997,137571,173600,219233,306480,354991,358594,376261,398572,447223,472009,646282,932548,993011,1004609,1021262,1177117,1234288,1283121,1288749,1337771,57291) AND role_id = '1194') OR company_id = '2321' |
(same as in the description, but with the table name matching the test data)
Setting `sort_buffer_size` to the default of `2M` does appear to work as a work around. I inherited most of the configuration and I'm not sure why it was set to `256k`, but `2M` should work fine.
In file sql/filesort.cc, function merge_buffers
[1411]if (!(error= (int) read_to_buffer(from_file, buffpek, |
rec_length)))
|
{
|
queue_remove(&queue,0); |
reuse_freed_buff(&queue, buffpek, rec_length);
|
[1416] } |
The assert in queue_remove(file queue.c)
uchar *queue_remove(register QUEUE *queue, uint idx)
|
{
|
uchar *element;
|
DBUG_ASSERT(idx >= 1 && idx <= queue->elements); |
element= queue->root[idx];
|
_downheap(queue, idx, queue->root[queue->elements--]);
|
return element; |
}
|
The assert fails because the queue_remove function is called with value of idx=0.
Yes but isn't it odd that queue_remove() has an assert that idx>=1, while filesort.cc has code that ALWAYS calls with idx=0?
Investigating how we ended up with this mismatch.
]
The call to
queue_remove(&queue, 0);
|
in filesort.cc comes from this patch
commit 76f0b94bb0b2994d639353530c5b251d0f1a204b
|
Author: Sergei Golubchik <sergii@pisem.net> Wed Oct 19 22:45:18 2011
|
Committer: Sergei Golubchik <sergii@pisem.net> Wed Oct 19 22:45:18 2011
|
|
merge with 5.3
|
|
...
|
The change made by that patch was
- VOID(queue_remove(&queue,0));
|
++ queue_remove(&queue,0);
|
The old code was introduced by
commit 69dd773b67a9892561f148964e51d3be606f7c77
|
Author: Igor Babaev <igor@askmonty.org>
|
Date: Mon Sep 13 15:22:11 2010 -0700
|
|
An implementation of index intersect via a modified Unique class.
|
This code is planned to be used for mwl#21.
|
but when that happened, queue_remove(..., 0) was used in multiple places.
queue_remove_top didn't exist, and the assert in queue_remove() looked like this:
DBUG_ASSERT(idx < queue->max_elements);
|
queue_remove_top was introduced in
commit ecbcddc03dc298ea1e6c0aa1a120bd0b4b04b3fd
|
Author: Michael Widenius <monty@askmonty.org> Fri Jul 16 10:33:01 2010
|
Committer: Michael Widenius <monty@askmonty.org> Fri Jul 16 10:33:01 2010
|
|
Improved speed of thr_alarm from O(N) to O(1). thr_alarm is used to handle timeouts and kill of connections.
|
Fixed compiler warnings.
|
queues.h and queues.c are now based on the UNIREG code and thus made BSD.
|
Fix code to use new queue() interface. This mostly affects how you access elements in the queue.
|
...
|
Also that patch changed:
uchar *queue_remove(register QUEUE *queue, uint idx)
|
{
|
uchar *element;
|
- DBUG_ASSERT(idx < queue->max_elements);
|
- element= queue->root[++idx]; /* Intern index starts from 1 */
|
- queue->root[idx]= queue->root[queue->elements--];
|
- _downheap(queue, idx);
|
+ DBUG_ASSERT(idx >= 1 && idx <= queue->elements);
|
+ element= queue->root[idx];
|
+ _downheap(queue, idx, queue->root[queue->elements--]);
|
return element;
|
}
|
Ok, it was a change in QUEUE code in 5.5 and a merge of code that used the old calling convention from 5.5
Indeed, line where this bug was had no test coverage:
I make a gcov build, run ./mysql-test-run --gcov --suite=main
and in sql/CMakeFiles/sql.dir/filesort.cc.gcov I see:
121: 1396: if (unique_buff)
|
-: 1397: {
|
-: 1398: /*
|
-: 1399: Called by Unique::get()
|
-: 1400: Copy the first argument to unique_buff for unique removal.
|
-: 1401: Store it also in 'to_file'.
|
-: 1402: */
|
28: 1403: buffpek= (BUFFPEK*) queue_top(&queue);
|
28: 1404: memcpy(unique_buff, buffpek->key, rec_length);
|
28: 1405: if (min_dupl_count)
|
-: 1406: memcpy(&dupl_count, unique_buff+dupl_count_ofs,
|
26: 1407: sizeof(dupl_count));
|
28: 1408: buffpek->key+= rec_length;
|
28: 1409: if (! --buffpek->mem_count)
|
-: 1410: {
|
#####: 1411: if (!(error= (int) read_to_buffer(from_file, buffpek,
|
#####: 1412: rec_length)))
|
-: 1413: {
|
#####: 1414: queue_remove(&queue,0);
|
#####: 1415: reuse_freed_buff(&queue, buffpek, rec_length);
|
-: 1416: }
|
#####: 1417: else if (error == -1)
|
#####: 1418: goto err; /* purecov: inspected */
|
-: 1419: }
|
A smaller testcase:
-- source include/have_innodb.inc
|
|
create table t2 (
|
pk int(11) NOT NULL AUTO_INCREMENT,
|
col1 int(11) NOT NULL,
|
col2 int(11) NOT NULL,
|
col3 int(11) NOT NULL,
|
key2 int(11) NOT NULL,
|
col4 int(11) NOT NULL,
|
key1 int(11) NOT NULL,
|
PRIMARY KEY (pk),
|
KEY key1 (key1),
|
KEY key2 (key2)
|
) ENGINE=InnoDB AUTO_INCREMENT=12860259 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;
|
|
create table ten(a int);
|
insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
|
|
create table ten_k(a int);
|
insert into ten_k select A.a + B.a* 10 + C.a * 100 + D.a*1000 from ten A, ten B, ten C, ten D;
|
|
insert into t2 (key1, key2, col1,col2,col3,col4)
|
select
|
a,a,
|
a,a,a,a
|
from ten_k;
|
SELECT sum(col1) FROM t2 FORCE INDEX (key1,key2) WHERE (key1 between 10 and 8191+10) or (key2= 5);
|
varun, please rename all tables t0,t1,t2.. and use that for the bugfix.
Could you please also attach your cnf file(s)? Also, if you can risk getting a crash again, please check if EXPLAIN EXTENDED ... works for the query, and if it does, paste the output of it.