[MDEV-4394] Sporadic failures in multi_source tests Created: 2013-04-13  Updated: 2021-01-12  Resolved: 2013-04-14

Status: Closed
Project: MariaDB Server
Component/s: None
Affects Version/s: 10.0.1
Fix Version/s: 10.0.2

Type: Bug Priority: Major
Reporter: Elena Stepanova Assignee: Michael Widenius
Resolution: Fixed Votes: 0
Labels: tests


 Description   

http://buildbot.askmonty.org/buildbot/builders/kvm-bintar-precise-x86/builds/703/steps/test/logs/stdio

multi_source.info_logs                   w2 [ fail ]
        Test ended at 2013-04-10 05:21:07
 
CURRENT_TEST: multi_source.info_logs
mysqltest: In included file "./include/wait_for_slave_param.inc": 
included from ./include/wait_for_slave_sql_to_start.inc at line 32:
included from ./include/wait_for_slave_to_start.inc at line 27:
included from /usr/local/mariadb-10.0.1-linux-i686/mysql-test/suite/multi_source/info_logs.test at line 146:
At line 115: Cannot perform inc/dec on a non-numeric value
 
The result from queries just before the failure was:
< snip >
#   while 'MASTER 2.2' and '' are running
master-master@00202@002e2.info
master.info
multi-master.info
relay.bin-master@00202@002e2.info
relay.bin.info
# End of list
#
# Contents of multi-master.info
MASTER 2.2
# EOF
#
show all slaves status;
Connection_name	Slave_SQL_State	Slave_IO_State	Master_Host	Master_User	Master_Port	Connect_Retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_Do_DB	Replicate_Ignore_DB	Replicate_Do_Table	Replicate_Ignore_Table	Replicate_Wild_Do_Table	Replicate_Wild_Ignore_Table	Last_Errno	Last_Error	Skip_Counter	Exec_Master_Log_Pos	Relay_Log_Space	Until_Condition	Until_Log_File	Until_Log_Pos	Master_SSL_Allowed	Master_SSL_CA_File	Master_SSL_CA_Path	Master_SSL_Cert	Master_SSL_Cipher	Master_SSL_Key	Seconds_Behind_Master	Master_SSL_Verify_Server_Cert	Last_IO_Errno	Last_IO_Error	Last_SQL_Errno	Last_SQL_Error	Replicate_Ignore_Server_Ids	Master_Server_Id	Retried_transactions	Max_relay_log_size	Executed_log_entries	Slave_received_heartbeats	Slave_heartbeat_period
	Slave has read all relay log; waiting for the slave I/O thread to update it	Waiting for master to send event	127.0.0.1	root	MYPORT_1	60	master-bin.000001	286	relay.000002	572	master-bin.000001	Yes	Yes							0		0	286	857	None		0	No						0	No	0		0			1	0	1073741824	6	0	60.000
MASTER 2.2	Slave has read all relay log; waiting for the slave I/O thread to update it	Waiting for master to send event	127.0.0.1	root	MYPORT_2	60	master-bin.000001	286	relay-master@00202@002e2.000002	572	master-bin.000001	Yes	Yes							0		0	286	876	None		0	No						0	No	0		0			2	0	1073741824	6	0	60.000
include/wait_for_slave_to_start.inc
set default_master_connection = 'MASTER 2.2';
include/wait_for_slave_to_start.inc
**** ERROR: timeout after 300 seconds while waiting for slave parameter Slave_SQL_Running = Yes ****
 
More results from queries before failure can be found in /run/shm/var_auto_qGxp/2/log/info_logs.log
 
 - saving '/run/shm/var_auto_qGxp/2/log/multi_source.info_logs/' to '/usr/local/mariadb-10.0.1-linux-i686/mysql-test/var/log/multi_source.info_logs/'
 
Retrying test multi_source.info_logs, attempt(2/3)...
 
multi_source.info_logs                   w2 [ retry-pass ]   1490

http://buildbot.askmonty.org/buildbot/builders/kvm-deb-debian6-x86/builds/1939/steps/test_3/logs/stdio

multi_source.multisource 'innodb_plugin' w4 [ fail ]
        Test ended at 2013-04-10 08:29:52
 
CURRENT_TEST: multi_source.multisource
--- /usr/share/mysql/mysql-test/suite/multi_source/multisource.result	2013-04-10 07:48:58.000000000 +0200
+++ /dev/shm/var/4/log/multisource.reject	2013-04-10 08:29:51.357212484 +0200
@@ -72,7 +72,7 @@
 include/wait_for_slave_to_start.inc
 show all slaves status;
 Connection_name	Slave_SQL_State	Slave_IO_State	Master_Host	Master_User	Master_Port	Connect_Retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_Do_DB	Replicate_Ignore_DB	Replicate_Do_Table	Replicate_Ignore_Table	Replicate_Wild_Do_Table	Replicate_Wild_Ignore_Table	Last_Errno	Last_Error	Skip_Counter	Exec_Master_Log_Pos	Relay_Log_Space	Until_Condition	Until_Log_File	Until_Log_Pos	Master_SSL_Allowed	Master_SSL_CA_File	Master_SSL_CA_Path	Master_SSL_Cert	Master_SSL_Cipher	Master_SSL_Key	Seconds_Behind_Master	Master_SSL_Verify_Server_Cert	Last_IO_Errno	Last_IO_Error	Last_SQL_Errno	Last_SQL_Error	Replicate_Ignore_Server_Ids	Master_Server_Id	Retried_transactions	Max_relay_log_size	Executed_log_entries	Slave_received_heartbeats	Slave_heartbeat_period
-	Slave has read all relay log; waiting for the slave I/O thread to update it	Waiting for master to send event	127.0.0.1	root	MYPORT_2	60	master-bin.000001	286	mysqld-relay-bin.000002	572	master-bin.000001	Yes	Yes							0		0	286	868	None		0	No						0	No	0		0			2	0	1073741824	6	0	60.000
+	Slave has read all relay log; waiting for the slave I/O thread to update it	Checking master version	127.0.0.1	root	MYPORT_2	60		4	mysqld-relay-bin.000001	4		Yes	Yes							0		0	0	246	None		0	No						0	No	0		0			0	0	1073741824	1	0	60.000
 master1	Slave has read all relay log; waiting for the slave I/O thread to update it	Waiting for master to send event	127.0.0.1	root	MYPORT_1	60	master-bin.000001	876	mysqld-relay-bin-master1.000002	1162	master-bin.000001	Yes	Yes							0		0	876	1466	None		0	No						0	No	0		0			1	0	1073741824	13	0	60.000
 insert into t1 (f1) values ('three');
 drop database if exists db2;
 
mysqltest: Result length mismatch
 
 - saving '/dev/shm/var/4/log/multi_source.multisource-innodb_plugin/' to '/dev/shm/var/log/multi_source.multisource-innodb_plugin/'
 
Retrying test multi_source.multisource, attempt(2/3)...
 
multi_source.relaylog_events             w2 [ pass ]    461
multi_source.reset_slave                 w2 [ pass ]    316
multi_source.info_logs                   w3 [ pass ]   3664
multi_source.multisource 'xtradb'        w1 [ pass ]   1518
multi_source.simple                      w2 [ pass ]    576
perfschema.cnf_option                    w3 [ pass ]      5
multi_source.multisource 'innodb_plugin' w4 [ retry-pass ]   1209
 
Retrying test multi_source.multisource, attempt(3/3)...
 
multi_source.multisource 'innodb_plugin' w4 [ retry-fail ]
        Test ended at 2013-04-10 08:30:03
 
CURRENT_TEST: multi_source.multisource
mysqltest: At line 14: query 'change master 'abc' to relay_log_file=''' failed with wrong errno 1201: 'Could not initialize master info structure for 'abc'; more error messages can be found in the MariaDB error log', instead of 1380...
 
The result from queries just before the failure was:
change master 'abc' to relay_log_file='';
 
 - skipping '/dev/shm/var/4/log/multi_source.multisource-innodb_plugin/'
multi_source.skip_counter                w2 [ pass ]   1008
***Warnings generated in error logs during shutdown after running tests: multi_source.multisource
 
130410  8:30:02 [ERROR] Failed to open the relay log './' (relay_log_pos 4)
130410  8:30:02 [ERROR] Could not find target log during relay log initialization
 
rpl.rpl_auto_increment 'innodb_plugin,mix' w1 [ pass ]   1956
 



 Comments   
Comment by Kristian Nielsen [ 2013-04-14 ]

I checked my notes, and it turns out I already analysed this while working on global transaction Id.

According to my notes, this failure is a consequence of MDEV-4033.

I do not yet have an explanation how the strange inc/dec error turns up, however it fits with the place where the test fails - it is just after restarting the slave mysqld server, it will try to start the two slave threads at approximately the same time, opening a window for getting this problem.

Comment by Kristian Nielsen [ 2013-04-14 ]

Ok, now I got it.
This problem happens in include/show_rpl_debug_info.inc, which is invoked from
include/wait_for_slave_param.inc when the timeout is caught.

The error occurs because monty is not using include/rpl_topology.inc, so
$rpl_server_count is not set (and we inc an empty string).

The error location is reported incorrectly because of the general problem that
any errors within a while() loop are reported as occuring at the end of the
loop.

Clear enough, indeed it is the file name conflict in the slave threads that is
the problem.

Comment by Kristian Nielsen [ 2013-04-14 ]

Hm, ok there are two failures mentioned here. For clarity, the MDEV-4033 is about the first one "Cannot perform inc/dec ...". The second one I believe Elena already has a fix for.

Comment by Michael Widenius [ 2013-04-14 ]

MDEV 4033 will be pushed shortly
I also added --let $rpl_server_count= 0 to all mutli-source tests
I can't easily use rpl_init.inc as the multi-source tests doesn't use any of the normal topologies.

(I also couldn't find a rpl_topology.inc file)

Comment by Michael Widenius [ 2013-04-14 ]

Pushed into 10.0-base

Comment by Elena Stepanova [ 2013-04-15 ]

It was a fix for the first failure (and for MDEV-4033), while I'm still to push the wait condition for "Waiting for master to send event" (the second failure), right? I don't expect that the revno 3602 alone will fix it too, will it?

Comment by Elena Stepanova [ 2013-04-15 ]

Below is the fix. It looks innocent, so I've pushed it anyway, revno 3603. If you disagree, please feel free to revert.
It only deals with one possible race condition that we discussed (in 3 places), while there might be more, but we'll have to catch them one by one unless we want to remove SHOW SLAVE STATUS completely.

=== modified file 'mysql-test/suite/multi_source/multisource.test'
— mysql-test/suite/multi_source/multisource.test 2013-04-14 15:30:05 +0000
+++ mysql-test/suite/multi_source/multisource.test 2013-04-14 23:21:34 +0000
@@ -43,6 +43,12 @@

  1. each of the 3 commands should produce
  2. 'master1' status

+let $wait_for_all= 1;
+let $show_statement= SHOW ALL SLAVES STATUS;
+let $field= Slave_IO_State;
+let $condition= = 'Waiting for master to send event';
+--source include/wait_show_condition.inc
+
--replace_result $SERVER_MYPORT_1 MYPORT_1
show slave 'master1' status;
--replace_result $SERVER_MYPORT_1 MYPORT_1
@@ -132,6 +138,12 @@

  1. See both connections in the same status output

+let $wait_for_all= 1;
+let $show_statement= SHOW ALL SLAVES STATUS;
+let $field= Slave_IO_State;
+let $condition= = 'Waiting for master to send event';
+--source include/wait_show_condition.inc
+
--replace_result $SERVER_MYPORT_1 MYPORT_1 $SERVER_MYPORT_2 MYPORT_2
show all slaves status;

@@ -196,6 +208,12 @@

--source wait_for_sql_thread_read_all.inc

+let $wait_for_all= 1;
+let $show_statement= SHOW ALL SLAVES STATUS;
+let $field= Slave_IO_State;
+let $condition= = 'Waiting for master to send event';
+--source include/wait_show_condition.inc
+
--replace_result $SERVER_MYPORT_1 MYPORT_1 $SERVER_MYPORT_2 MYPORT_2
show all slaves status;

Comment by Elena Stepanova [ 2013-04-15 ]

Same problem appeared in info_logs on revno 3602, hence same fix (pushed as revno 3604).

=== modified file 'mysql-test/suite/multi_source/info_logs.test'
— mysql-test/suite/multi_source/info_logs.test 2013-04-14 15:30:05 +0000
+++ mysql-test/suite/multi_source/info_logs.test 2013-04-15 00:11:16 +0000
@@ -130,6 +130,13 @@
--echo # EOF
--echo #

+let $wait_for_all= 1;
+let $show_statement= SHOW ALL SLAVES STATUS;
+let $field= Slave_IO_State;
+let $condition= = 'Waiting for master to send event';
+--source include/wait_show_condition.inc
+
+
--replace_result $SERVER_MYPORT_1 MYPORT_1 $SERVER_MYPORT_2 MYPORT_2
show all slaves status;

@@ -146,6 +153,13 @@
--source include/wait_for_slave_to_start.inc
set default_master_connection = '';

+let $wait_for_all= 1;
+let $show_statement= SHOW ALL SLAVES STATUS;
+let $field= Slave_IO_State;
+let $condition= = 'Waiting for master to send event';
+--source include/wait_show_condition.inc
+
+
--replace_result $SERVER_MYPORT_1 MYPORT_1 $SERVER_MYPORT_2 MYPORT_2
show all slaves status;

Generated at Thu Feb 08 06:56:05 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.