[MDEV-31052] Parallel slave hangs with binlog_alter_two_phase=ON and ALTER replication error Created: 2023-04-13  Updated: 2023-11-28

Status: Open
Project: MariaDB Server
Component/s: Replication
Affects Version/s: 10.8, 10.9, 10.10, 10.11, 11.0
Fix Version/s: 10.11, 11.0

Type: Bug Priority: Major
Reporter: Angelique Sklavounos (Inactive) Assignee: Andrei Elkin
Resolution: Unresolved Votes: 1
Labels: None

Issue Links:
Relates
relates to MDEV-30780 parallel slave hangs after hit an error Closed
relates to MDEV-31050 No Duplicate entry error on ALTER TAB... Open

 Description   

I originally came across this during RQG Replication testing for MDEV-16329. Using the test case in MDEV-31050 but with parallel slave threads and binlog_alter_two_phase=ON, it is possible to achieve a SHOW SLAVE STATUS with

10.8 e552747cfdf364a5ade1

Slave_SQL_Running	Yes
..
Last_SQL_Errno	1062
Last_SQL_Error	Error 'Duplicate entry '0' for key 'col_int_nokey'' on query. Default database: 'test'. Query: 'ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY'

I am not sure if this is an issue that should be solved in its own right or is merely a consequence of MDEV-31050, but thought I should raise it separately in light of MDEV-30780.

Test case for reproducing

--source include/master-slave.inc
--source include/have_aria.inc
--source include/have_binlog_format_row.inc
 
# Need both binlog_alter_two_phase=1 and parallel replica threads
--connection master
--let $binlog_alter_two_phase= `select @@binlog_alter_two_phase`
set global binlog_alter_two_phase = ON;
set binlog_alter_two_phase = ON;
 
--connection slave
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=6;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
# Same issue with conservative
SET GLOBAL slave_parallel_mode='optimistic';
--source include/start_slave.inc
 
 
--connection master
USE test;
# No replication error with MyISAM and InnoDB
# Need ROW_FORMAT DYNAMIC for replication error
CREATE OR REPLACE TABLE t9 (`col_int_nokey` SMALLINT UNSIGNED DEFAULT 0 ) ROW_FORMAT DYNAMIC ENGINE=Aria;
LOCK TABLE t9 WRITE;
INSERT INTO t9 () VALUES (),(),();
FLUSH TABLES;
ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY;
 
--connection slave
--let $slave_param= Last_SQL_Errno
--let $slave_param_value= 1062
--let $slave_timeout= 60
--source include/wait_for_slave_param.inc
--let $status_items= Last_SQL_Error
--source include/show_slave_status.inc
--let $slave_sql_errno= 1062
--let $slave_timeout= 120
# This will time out as the macro expects the SQL thread to NOT be running
--source include/wait_for_slave_sql_error_and_skip.inc
--source include/stop_slave.inc
SET GLOBAL slave_parallel_mode=@old_parallel_mode;
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc
--connection master
DROP table t9;
--eval set global binlog_alter_two_phase=$binlog_alter_two_phase
--source include/rpl_end.inc

The test notably fails with:

**** SHOW PROCESSLIST on server_2 ****
SHOW PROCESSLIST;
Id	User	Host	db	Command	Time	State	Info	Progress
5	root	localhost:34070	test	Query	0	starting	SHOW PROCESSLIST	0.000
6	root	localhost:34086	test	Sleep	122		NULL	0.000
9	root	localhost:40302	test	Sleep	0		NULL	0.000
10	root	localhost:40316	test	Sleep	121		NULL	0.000
11	system user		NULL	Slave_IO	121	Waiting for master to send event	NULL	0.000
14	system user		NULL	Slave_worker	121	Waiting for work from SQL thread	NULL	0.000
13	system user		NULL	Slave_worker	121	Waiting for work from SQL thread	NULL	0.000
15	system user		NULL	Slave_worker	121	Waiting for work from SQL thread	NULL	0.000
16	system user		test	Slave_worker	121	closing tables	ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY	0.000
17	system user		NULL	Slave_worker	121	Waiting for work from SQL thread	NULL	0.000
18	system user		NULL	Slave_worker	121	Waiting for work from SQL thread	NULL	0.000
12	system user		NULL	Slave_SQL	121	Reading event from the relay log	NULL	0.000

**** SHOW BINLOG EVENTS on server_2 ****
binlog_name = 'slave-bin.000001'
SHOW BINLOG EVENTS IN 'slave-bin.000001';
Log_name	Pos	Event_type	Server_id	End_log_pos	Info
slave-bin.000001	4	Format_desc	2	256	Server ver: 10.8.8-MariaDB-debug-log, Binlog ver: 4
slave-bin.000001	256	Gtid_list	2	285	[]
slave-bin.000001	285	Binlog_checkpoint	2	328	slave-bin.000001
slave-bin.000001	328	Gtid	1	370	GTID 0-1-1
slave-bin.000001	370	Query	1	550	use `test`; CREATE OR REPLACE TABLE t9 (`col_int_nokey` SMALLINT UNSIGNED DEFAULT 0 ) ROW_FORMAT DYNAMIC ENGINE=Aria
slave-bin.000001	550	Gtid	1	592	BEGIN GTID 0-1-2
slave-bin.000001	592	Query	1	692	use `test`; INSERT INTO t9 () VALUES (),(),()
slave-bin.000001	692	Query	1	765	COMMIT
slave-bin.000001	765	Gtid	1	807	GTID 0-1-3
slave-bin.000001	807	Query	1	886	use `test`; FLUSH TABLES
slave-bin.000001	886	Gtid	1	928	GTID 0-1-4 START ALTER
slave-bin.000001	928	Query	1	1056	use `test`; ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY
 
**** SHOW RELAYLOG EVENTS on server_2 ****
relaylog_name = 'slave-relay-bin.000003'
SHOW RELAYLOG EVENTS IN 'slave-relay-bin.000003';
Log_name	Pos	Event_type	Server_id	End_log_pos	Info
slave-relay-bin.000003	4	Format_desc	2	256	Server ver: 10.8.8-MariaDB-debug-log, Binlog ver: 4
slave-relay-bin.000003	256	Rotate	1	0	master-bin.000001;pos=329
slave-relay-bin.000003	304	Format_desc	1	0	Server ver: 10.8.8-MariaDB-debug-log, Binlog ver: 4
slave-relay-bin.000003	556	Gtid	1	371	GTID 0-1-1
slave-relay-bin.000003	598	Query	1	551	use `test`; CREATE OR REPLACE TABLE t9 (`col_int_nokey` SMALLINT UNSIGNED DEFAULT 0 ) ROW_FORMAT DYNAMIC ENGINE=Aria
slave-relay-bin.000003	778	Gtid	1	593	BEGIN GTID 0-1-2
slave-relay-bin.000003	820	Query	1	693	use `test`; INSERT INTO t9 () VALUES (),(),()
slave-relay-bin.000003	920	Query	1	766	COMMIT
slave-relay-bin.000003	993	Gtid	1	808	GTID 0-1-3
slave-relay-bin.000003	1035	Query	1	887	use `test`; FLUSH TABLES
slave-relay-bin.000003	1114	Gtid	1	929	GTID 0-1-4 START ALTER
slave-relay-bin.000003	1156	Query	1	1057	use `test`; ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY
slave-relay-bin.000003	1284	Gtid	1	1102	GTID 0-1-5 COMMIT ALTER id=4
slave-relay-bin.000003	1329	Query	1	1247	use `test`; ALTER TABLE t9 ADD UNIQUE (`col_int_nokey`), ALGORITHM=COPY
connection server_1;


Generated at Thu Feb 08 10:20:55 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.