[MDEV-32960] Semi-sync ACKed Transaction can Timeout and Switch Off Semi-sync with Multiple Replicas Created: 2023-12-06  Updated: 2024-01-23

Status: Open
Project: MariaDB Server
Component/s: Replication
Affects Version/s: 10.6
Fix Version/s: 10.6

Type: Bug Priority: Major
Reporter: Brandon Nesterenko Assignee: Michael Widenius
Resolution: Unresolved Votes: 0
Labels: MDEV-32551-test

Issue Links:
Problem/Incident
is caused by MDEV-32551 "Read semi-sync reply magic number er... Closed

 Description   

If a semi-sync primary has multiple semi-sync replicas, the primary seems to listen for ACKs from only one replica at a time. If that replica fails to reply, but another one does reply, the transaction will still time out and semi-sync will switch off. See the following MTR snippets:

When server_2 delays its ACK (using debug with "+d,simulate_delay_semisync_slave_reply") but server_3 sends its ACK, the transaction times out and semi-sync turns off:

# Reproduction for MDEV-32960, case 1: server_2 delays its semi-sync ACK
# while server_3 sends its ACK normally.
# Recorded outcome: Rpl_semi_sync_master_status is OFF after the first
# transaction, i.e. the transaction timed out although server_3 replied.
#
# The script sets debug_dbug below, hence the have_debug.inc requirement.
--source include/have_debug.inc
# binlog_format independent
--source include/have_binlog_format_statement.inc
 
# Topology: server_1 is the primary of both server_2 and server_3.
--let $rpl_topology= 1->2,1->3
--source include/rpl_init.inc
 
 
# Enable semi-sync on the primary with a timeout (500) that is shorter than
# the simulated replica delay, so a delayed ACK arrives after the timeout.
--connection server_1
set global rpl_semi_sync_master_enabled= 1;
set global rpl_semi_sync_master_timeout= 500; # a slave will be delayed 800 milliseconds to force the timeout
 
# server_2: enable semi-sync and delay its ACK via the
# simulate_delay_semisync_slave_reply debug point.
# The replica is stopped and restarted around the settings, presumably so
# that they apply to a fresh replication connection -- TODO confirm.
--connection server_2
--source include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
--source include/start_slave.inc
 
# server_3: enable semi-sync with no simulated delay.
--connection server_3
--source include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
--source include/start_slave.inc
 
# Wait until the primary reports 2 semi-sync clients before running any
# transaction.
--connection server_1
--echo # Ensure semi-sync is working for both connected replicas
--let $status_var_value= 2
--let $status_var= rpl_semi_sync_master_clients
--source include/wait_for_status_var.inc
 
--echo #
--echo # Semi-sync status starts ON
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
# First transaction after both replicas joined. In the recorded result the
# status shown right after it is OFF.
--connection server_1
create table t1 (a int);
 
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
# Second transaction; the recorded status stays OFF.
--connection server_1
insert into t1 values (1);
 
--echo #
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
--echo #
--echo # Cleanup
 
# NOTE(review): only rpl_end.inc is sourced here; this script does not drop
# t1 or restore the global variables it changed above.
--source include/rpl_end.inc
--echo # End of test

And result:

include/rpl_init.inc [topology=1->2,1->3]
connection server_1;
set global rpl_semi_sync_master_enabled= 1;
set global rpl_semi_sync_master_timeout= 500;
connection server_2;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
include/start_slave.inc
connection server_3;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
include/start_slave.inc
connection server_1;
# Ensure semi-sync is working for both connected replicas
#
# Semi-sync status starts ON
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	ON
connection server_1;
create table t1 (a int);
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	OFF
connection server_1;
insert into t1 values (1);
#
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	OFF
#
# Cleanup
include/rpl_end.inc
# End of test

But conversely, if server_2 ACKs but server_3 times out, the primary sees the ACK and no timeout occurs:

# Reproduction for MDEV-32960, case 2: server_2 sends its semi-sync ACK
# normally while server_3 delays its ACK.
# Recorded outcome: Rpl_semi_sync_master_status stays ON after both
# transactions, i.e. the primary sees server_2's ACK and does not time out.
#
# The script sets debug_dbug below, hence the have_debug.inc requirement.
--source include/have_debug.inc
# binlog_format independent
--source include/have_binlog_format_statement.inc
 
# Topology: server_1 is the primary of both server_2 and server_3.
--let $rpl_topology= 1->2,1->3
--source include/rpl_init.inc
 
 
# Enable semi-sync on the primary with a timeout (500) that is shorter than
# the simulated replica delay.
--connection server_1
set global rpl_semi_sync_master_enabled= 1;
set global rpl_semi_sync_master_timeout= 500; # a slave will be delayed 800 milliseconds to force the timeout
 
# server_2: enable semi-sync with no simulated delay.
# The replica is stopped and restarted around the settings, presumably so
# that they apply to a fresh replication connection -- TODO confirm.
--connection server_2
--source include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
--source include/start_slave.inc
 
# server_3: enable semi-sync and delay its ACK via the
# simulate_delay_semisync_slave_reply debug point.
--connection server_3
--source include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
--source include/start_slave.inc
 
# Wait until the primary reports 2 semi-sync clients before running any
# transaction.
--connection server_1
--echo # Ensure semi-sync is working for both connected replicas
--let $status_var_value= 2
--let $status_var= rpl_semi_sync_master_clients
--source include/wait_for_status_var.inc
 
--echo #
--echo # Semi-sync status starts ON
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
# First transaction after both replicas joined. In the recorded result the
# status shown right after it is still ON.
--connection server_1
create table t1 (a int);
 
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
# Second transaction; the recorded status stays ON.
--connection server_1
insert into t1 values (1);
 
--echo #
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
 
--echo #
--echo # Cleanup
 
# NOTE(review): only rpl_end.inc is sourced here; this script does not drop
# t1 or restore the global variables it changed above.
--source include/rpl_end.inc
--echo # End of test

With result:

include/rpl_init.inc [topology=1->2,1->3]
connection server_1;
set global rpl_semi_sync_master_enabled= 1;
set global rpl_semi_sync_master_timeout= 500;
connection server_2;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
include/start_slave.inc
connection server_3;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled= 1;
set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
include/start_slave.inc
connection server_1;
# Ensure semi-sync is working for both connected replicas
#
# Semi-sync status starts ON
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	ON
connection server_1;
create table t1 (a int);
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	ON
connection server_1;
insert into t1 values (1);
#
SHOW STATUS LIKE 'rpl_semi_sync_master_status';
Variable_name	Value
Rpl_semi_sync_master_status	ON
#
# Cleanup
include/rpl_end.inc
# End of test



 Comments   
Comment by Brandon Nesterenko [ 2023-12-07 ]

I think I see the issue. If a semi-sync connection has already been established on a primary, any new semi-sync slaves that join won't be listened to until a transaction has been ACKed after the new connection is added to the Ack_Receiver. The problem lies in the next transaction after a new semi-sync slave is added. If the existing connection (the one being listened to for ACKs) fails to send an ACK for that transaction, the Ack receiver thread won't be able to read the ACKs from the newly added replicas. This results in a timeout of the connection and semi-sync falling back to async mode altogether.

Generated at Thu Feb 08 10:35:19 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.