|
it's my initial bug testing with 10.0.17 and 10.1.5 reported to mariadb support.
I'm observe on randomize insert with watch command.
Stop all slave/start all slave temporary resolve a error.
3 server mariadb 10.1.5 datamod01, datamod02, datamod03
root@datamod01 12:24:10 [test]> select @@global.gtid_domain_id,@@global.server_id,@@global.gtid_ignore_duplicates \G
|
@@global.gtid_domain_id: 1
|
@@global.server_id: 101
|
@@global.gtid_ignore_duplicates: 1
|
|
root@datamod02 12:03:11 [test]> select @@global.gtid_domain_id,@@global.server_id,@@global.gtid_ignore_duplicates \G
|
@@global.gtid_domain_id: 2
|
@@global.server_id: 102
|
@@global.gtid_ignore_duplicates: 1
|
|
root@datamod03 12:03:40 [test]> select @@global.gtid_domain_id,@@global.server_id,@@global.gtid_ignore_duplicates \G
|
@@global.gtid_domain_id: 3
|
@@global.server_id: 103
|
@@global.gtid_ignore_duplicates: 1
|
root@datamod01 12:24:27 [test]> show slave hosts;
|
+-----------+----------+------+-----------+
|
| Server_id | Host | Port | Master_id |
|
+-----------+----------+------+-----------+
|
| 103 | sqlmod03 | 3306 | 101 |
|
| 102 | sqlmod02 | 3306 | 101 |
|
+-----------+----------+------+-----------+
|
|
root@datamod02 12:24:48 [test]> show slave hosts;
|
+-----------+----------+------+-----------+
|
| Server_id | Host | Port | Master_id |
|
+-----------+----------+------+-----------+
|
| 103 | sqlmod03 | 3306 | 102 |
|
| 101 | sqlmod01 | 3306 | 102 |
|
+-----------+----------+------+-----------+
|
|
root@datamod03 12:25:31 [test]> show slave hosts;
|
+-----------+----------+------+-----------+
|
| Server_id | Host | Port | Master_id |
|
+-----------+----------+------+-----------+
|
| 101 | sqlmod01 | 3306 | 103 |
|
| 102 | sqlmod02 | 3306 | 103 |
|
+-----------+----------+------+-----------+
|
root@datamod01 13:21:49 [test]> stop all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.01 sec)
|
|
root@datamod01 13:21:51 [test]> flush slave 'slave-of-sqlmod02';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod01 13:22:00 [test]> flush slave 'slave-of-sqlmod03';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod01 13:28:18 [test]> flush master;
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod01 13:29:20 [test]> change master 'slave-of-sqlmod02' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.00 sec)
|
root@datamod01 13:30:13 [test]> change master 'slave-of-sqlmod03' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.01 sec)
|
|
root@datamod01 13:31:52 [test]> start all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.01 sec)
|
root@datamod02 12:25:48 [test]> stop all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.01 sec)
|
|
root@datamod02 13:22:23 [test]> flush slave 'slave-of-sqlmod01';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod02 13:22:40 [test]> flush slave 'slave-of-sqlmod03';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod02 13:26:42 [test]> flush master;
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod02 13:28:36 [test]> change master 'slave-of-sqlmod01' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod02 13:32:30 [test]> change master 'slave-of-sqlmod03' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod02 13:32:34 [test]> start all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.01 sec)
|
root@datamod03 12:29:05 [test]> stop all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.00 sec)
|
|
root@datamod03 13:22:25 [test]> flush slave 'slave-of-sqlmod01';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod03 13:22:53 [test]> flush slave 'slave-of-sqlmod02';
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod03 13:26:44 [test]> flush master;
|
Query OK, 0 rows affected (0.01 sec)
|
|
root@datamod03 13:28:37 [test]> change master 'slave-of-sqlmod01' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.01 sec)
|
|
root@datamod03 13:33:11 [test]> change master 'slave-of-sqlmod02' to master_use_gtid=slave_pos ;
|
Query OK, 0 rows affected (0.00 sec)
|
|
root@datamod03 13:33:16 [test]> start all slaves;
|
Query OK, 0 rows affected, 2 warnings (0.01 sec)
|
On all server :
show create table start_dump\G
|
Table: start_dump
|
Create Table: CREATE TABLE `start_dump` (
|
`ID` tinyint(3) unsigned NOT NULL DEFAULT '1',
|
`DUMP_OK` tinyint(1) DEFAULT '0',
|
PRIMARY KEY (`ID`)
|
) ENGINE=MEMORY DEFAULT CHARSET=latin1 MAX_ROWS=1
|
Before test:
Every 1.0s: echo hostname:datamod01; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)'
|
Mon Jul 20 14:16:46 2015
|
|
hostname:datamod01
|
Connection_name: slave-of-sqlmod02
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod03
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
Every 1.0s: echo hostname:datamod02; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)'
|
Mon Jul 20 14:17:00 2015
|
|
hostname:datamod02
|
Connection_name: slave-of-sqlmod01
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod03
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
Every 1.0s: echo hostname:datamod03; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)' Mon Jul 20 14:17:08 2015
|
|
hostname:datamod03
|
Connection_name: slave-of-sqlmod01
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod02
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 310
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 310
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-399,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-399,2-102-41,3-103-72
|
test:
root@datamod01:/srv/mariadb# watch -n 1 "mysql -uroot -e 'truncate table test.start_dump ; insert into test.start_dump set DUMP_OK=1;'"
|
after many truncate, insert:
Every 1.0s: echo hostname:datamod01; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)' Mon Jul 20 16:13:25 2015
|
hostname:datamod01
|
Connection_name: slave-of-sqlmod02
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 4968
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 4968
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod03
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 4968
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 4968
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
Every 1.0s: echo hostname:datamod02; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)'
|
Mon Jul 20 16:13:52 2015
|
|
hostname:datamod02
|
Connection_name: slave-of-sqlmod01
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 68494
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: No
|
Exec_Master_Log_Pos: 63351
|
Last_IO_Error:
|
Last_SQL_Error: Could not execute Write_rows_v1 event on table test.start_dump; Duplicate entry '1' for key 'PRIMARY', Error_code: 1062; handler error HA_ERR_FOUND_DUPP_KEY; the event's master log bin-log.000001, end_log_pos 63469
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod03
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 4968
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 4968
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
Every 1.0s: echo hostname:datamod03; mysql -uroot -e 'show all slaves status\G' |grep -E '(Connection_name:|Master_Lo|Last_.*_Error|Running|Gtid)' Mon Jul 20 16:14:05 2015
|
hostname:datamod03
|
Connection_name: slave-of-sqlmod01
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 68494
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 68494
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
Connection_name: slave-of-sqlmod02
|
Master_Log_File: bin-log.000001
|
Read_Master_Log_Pos: 4968
|
Relay_Master_Log_File: bin-log.000001
|
Slave_IO_Running: Yes
|
Slave_SQL_Running: Yes
|
Exec_Master_Log_Pos: 4968
|
Last_IO_Error:
|
Last_SQL_Error:
|
Using_Gtid: Slave_Pos
|
Gtid_IO_Pos: 1-101-433,2-102-41,3-103-72
|
Gtid_Slave_Pos: 1-101-433,2-102-41,3-103-72
|
show master status:
root@datamod01 16:25:22 [(none)]> show master status \G
|
File: bin-log.000001
|
Position: 68494
|
Binlog_Do_DB:
|
Binlog_Ignore_DB:
|
1 row in set (0.00 sec)
|
|
root@datamod02 16:26:13 [(none)]> show master status \G
|
File: bin-log.000001
|
Position: 4968
|
Binlog_Do_DB:
|
Binlog_Ignore_DB:
|
|
root@datamod03 16:26:32 [(none)]> show master status \G
|
File: bin-log.000001
|
Position: 4968
|
Binlog_Do_DB:
|
Binlog_Ignore_DB:
|
root@datamod02:/srv/mariadb# tail -4 ../data/sqlmod/log/mysqld.err
|
2015-07-20 13:33:42 140478747219712 [Note] Master 'slave-of-sqlmod03': Slave I/O thread: connected to master 'rsqlmod@datamod03:3306',replication starts at GTID position '1-101-399,2-102-41,3-103-72'
|
2015-07-20 16:09:31 140478747518720 [ERROR] Master 'slave-of-sqlmod01': Slave SQL: Could not execute Write_rows_v1 event on table test.start_dump; Duplicate entry '1' for key 'PRIMARY', Error_code: 1062; handler error HA_ERR_FOUND_DUPP_KEY; the event's master log bin-log.000001, end_log_pos 63469, Gtid 1-101-401, Internal MariaDB error code: 1062
|
2015-07-20 16:09:31 140478747518720 [Warning] Master 'slave-of-sqlmod01': Slave: Duplicate entry '1' for key 'PRIMARY' Error_code: 1062
|
2015-07-20 16:09:31 140478747518720 [ERROR] Master 'slave-of-sqlmod01': Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'bin-log.000001' position 63351; GTID position '1-101-400,2-102-41,3-103-72'
|
on datanode01:
mysqlbinlog bin-log.000001 --base64-output=DECODE-ROWS --verbose
|
|
# at 68311
|
#150720 16:09:48 server id 101 end_log_pos 68349 GTID 1-101-433
|
/*!100001 SET @@session.gtid_seq_no=433*//*!*/;
|
BEGIN
|
/*!*/;
|
# at 68349
|
# at 68398
|
#150720 16:09:48 server id 101 end_log_pos 68398 Table_map: `test`.`start_dump` mapped to number 439 (has triggers)
|
#150720 16:09:48 server id 101 end_log_pos 68429 Write_rows: table id 439 flags: STMT_END_F
|
### INSERT INTO `test`.`start_dump`
|
### SET
|
### @1=1
|
# at 68429
|
#150720 16:09:48 server id 101 end_log_pos 68494 Query thread_id=9361 exec_time=0 error_code=0
|
SET TIMESTAMP=1437401388/*!*/;
|
COMMIT
|
/*!*/;
|
on datanode02:
# at 4821
|
#150720 16:09:48 server id 101 end_log_pos 4859 GTID 1-101-433 trans
|
/*!100001 SET @@session.gtid_seq_no=433*//*!*/;
|
BEGIN
|
/*!*/;
|
# at 4859
|
# at 4909
|
#150720 16:09:48 server id 101 end_log_pos 4909 Table_map: `test`.`start_dump` mapped to number 44 (has triggers)
|
#150720 16:09:48 server id 101 end_log_pos 4941 Write_rows: table id 44 flags: STMT_END_F
|
### INSERT INTO `test`.`start_dump`
|
### SET
|
### @1=1
|
### @2=0
|
# at 4941
|
#150720 16:09:48 server id 101 end_log_pos 4968 Xid = 27517
|
COMMIT/*!*/;
|
on datanode03:
# at 4821
|
#150720 16:09:48 server id 101 end_log_pos 4859 GTID 1-101-433 trans
|
/*!100001 SET @@session.gtid_seq_no=433*//*!*/;
|
BEGIN
|
/*!*/;
|
# at 4859
|
# at 4909
|
#150720 16:09:48 server id 101 end_log_pos 4909 Table_map: `test`.`start_dump` mapped to number 48 (has triggers)
|
#150720 16:09:48 server id 101 end_log_pos 4941 Write_rows: table id 48 flags: STMT_END_F
|
### INSERT INTO `test`.`start_dump`
|
### SET
|
### @1=1
|
### @2=0
|
# at 4941
|
#150720 16:09:48 server id 101 end_log_pos 4968 Xid = 27120
|
COMMIT/*!*/;
|
on datanode02:
relay log for connection slave sqlnode01
mysqlbinlog mysqld-relay-bin-slave@002dof@002dsqlmod01.000002 --base64-output=DECODE-ROWS --verbose
|
|
# at 68596
|
#150720 16:09:48 server id 101 end_log_pos 68349 GTID 1-101-433
|
/*!100001 SET @@session.gtid_seq_no=433*//*!*/;
|
BEGIN
|
/*!*/;
|
# at 68634
|
# at 68683
|
#150720 16:09:48 server id 101 end_log_pos 68398 Table_map: `test`.`start_dump` mapped to number 439 (has triggers)
|
#150720 16:09:48 server id 101 end_log_pos 68429 Write_rows: table id 439 flags: STMT_END_F
|
### INSERT INTO `test`.`start_dump`
|
### SET
|
### @1=1
|
# at 68714
|
#150720 16:09:48 server id 101 end_log_pos 68494 Query thread_id=9361 exec_time=0 error_code=0
|
SET TIMESTAMP=1437401388/*!*/;
|
COMMIT
|
/*!*/;
|
relay log for connection slave sqlnode03
mysqlbinlog mysqld-relay-bin-slave@002dof@002dsqlmod03.000002 --base64-output=DECODE-ROWS --verbose
|
|
# at 5106
|
#150720 16:09:48 server id 101 end_log_pos 4859 GTID 1-101-433 trans
|
/*!100001 SET @@session.gtid_seq_no=433*//*!*/;
|
BEGIN
|
/*!*/;
|
# at 5144
|
# at 5194
|
#150720 16:09:48 server id 101 end_log_pos 4909 Table_map: `test`.`start_dump` mapped to number 48 (has triggers)
|
#150720 16:09:48 server id 101 end_log_pos 4941 Write_rows: table id 48 flags: STMT_END_F
|
### INSERT INTO `test`.`start_dump`
|
### SET
|
### @1=1
|
### @2=0
|
# at 5226
|
#150720 16:09:48 server id 101 end_log_pos 4968 Xid = 27120
|
COMMIT/*!*/;
|
|
|
Hi,
The event 1-101-433 is not interesting in itself. We need the set of binary/relay logs up to the moment when the problem happened, which, according to the error message, is 1-101-401. But better still, just attach the complete logs, don't extract anything from them (binary logs, relay logs, error logs). They should be tiny since the problem happens soon after flush.
Could you please provide those?
Also, not to spoil the experiment, it's better to be more careful about flush/start slaves. Since you have the all-directions replication, when you reset and start slaves on the first server, it might start picking up something from other slaves which are not reset yet, etc.
|