Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-32960

Semi-sync ACKed Transaction can Timeout and Switch Off Semi-sync with Multiple Replicas

    XMLWordPrintable

Details

    Description

      If a semi-sync primary has multiple semi-sync replicas, the primary seems to listen for ACKs from only one replica at a time. If that replica fails to reply in time, but another one does reply, the transaction will still time out and semi-sync will switch off. See the following MTR snippets:

      When server_2 delays its ACK (using debug with "+d,simulate_delay_semisync_slave_reply") but server_3 sends its ACK, the transaction times out and semi-sync turns off:

      # Reproduction, scenario 1: one primary (server_1) with two semi-sync
      # replicas (server_2 and server_3). server_2's semi-sync reply is delayed
      # through a debug_dbug injection point; server_3 replies normally.
      # debug_dbug is set below, hence the have_debug requirement.
      --source include/have_debug.inc
      # binlog_format independent
      # (so the test is pinned to a single format, statement)
      --source include/have_binlog_format_statement.inc
       
      # Topology: server_1 replicates to both server_2 and server_3.
      --let $rpl_topology= 1->2,1->3
      --source include/rpl_init.inc
       
       
      # Primary: enable semi-sync and set the ACK timeout to 500, which is
      # shorter than the 800 ms reply delay mentioned in the inline note.
      --connection server_1
      set global rpl_semi_sync_master_enabled= 1;
      set global rpl_semi_sync_master_timeout= 500; # a slave will be delayed 800 milliseconds to force the timeout
       
      # Replica server_2: enable semi-sync and activate the delayed-reply
      # injection point. The slave is stopped and started around the changes;
      # presumably so the new settings apply to the restarted connection
      # (TODO confirm).
      --connection server_2
      --source include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
      --source include/start_slave.inc
       
      # Replica server_3: enable semi-sync only; no delay is injected here.
      --connection server_3
      --source include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      --source include/start_slave.inc
       
      # Back on the primary: wait for rpl_semi_sync_master_clients to reach 2,
      # i.e. both replicas are registered as semi-sync clients.
      --connection server_1
      --echo # Ensure semi-sync is working for both connected replicas
      --let $status_var_value= 2
      --let $status_var= rpl_semi_sync_master_clients
      --source include/wait_for_status_var.inc
       
      --echo #
      --echo # Semi-sync status starts ON
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      # First transaction after both replicas are connected.
      --connection server_1
      create table t1 (a int);
       
      # In the recorded result of this scenario the status is already OFF here.
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      # Second transaction, followed by another status check.
      --connection server_1
      insert into t1 values (1);
       
      --echo #
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      --echo #
      --echo # Cleanup
       
      # NOTE(review): the globals changed above and table t1 are not restored
      # or dropped before rpl_end.inc; acceptable for a bug-report snippet, but
      # would need cleanup if turned into a permanent test - confirm.
      --source include/rpl_end.inc
      --echo # End of test
      

      And result:

      include/rpl_init.inc [topology=1->2,1->3]
      connection server_1;
      set global rpl_semi_sync_master_enabled= 1;
      set global rpl_semi_sync_master_timeout= 500;
      connection server_2;
      include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
      include/start_slave.inc
      connection server_3;
      include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      include/start_slave.inc
      connection server_1;
      # Ensure semi-sync is working for both connected replicas
      #
      # Semi-sync status starts ON
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	ON
      connection server_1;
      create table t1 (a int);
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	OFF
      connection server_1;
      insert into t1 values (1);
      #
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	OFF
      #
      # Cleanup
      include/rpl_end.inc
      # End of test
      

      Conversely, if server_2 ACKs but server_3's ACK is delayed past the timeout, the primary sees the ACK and no timeout occurs:

      # Reproduction, scenario 2: same setup as the first snippet, but the
      # delayed-reply injection point is set on server_3 instead of server_2,
      # so server_2 replies normally.
      # debug_dbug is set below, hence the have_debug requirement.
      --source include/have_debug.inc
      # binlog_format independent
      # (so the test is pinned to a single format, statement)
      --source include/have_binlog_format_statement.inc
       
      # Topology: server_1 replicates to both server_2 and server_3.
      --let $rpl_topology= 1->2,1->3
      --source include/rpl_init.inc
       
       
      # Primary: enable semi-sync and set the ACK timeout to 500, which is
      # shorter than the 800 ms reply delay mentioned in the inline note.
      --connection server_1
      set global rpl_semi_sync_master_enabled= 1;
      set global rpl_semi_sync_master_timeout= 500; # a slave will be delayed 800 milliseconds to force the timeout
       
      # Replica server_2: enable semi-sync only; no delay is injected here.
      --connection server_2
      --source include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      --source include/start_slave.inc
       
      # Replica server_3: enable semi-sync and activate the delayed-reply
      # injection point.
      --connection server_3
      --source include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
      --source include/start_slave.inc
       
      # Back on the primary: wait for rpl_semi_sync_master_clients to reach 2,
      # i.e. both replicas are registered as semi-sync clients.
      --connection server_1
      --echo # Ensure semi-sync is working for both connected replicas
      --let $status_var_value= 2
      --let $status_var= rpl_semi_sync_master_clients
      --source include/wait_for_status_var.inc
       
      --echo #
      --echo # Semi-sync status starts ON
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      # First transaction after both replicas are connected.
      --connection server_1
      create table t1 (a int);
       
      # In the recorded result of this scenario the status is still ON here.
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      # Second transaction, followed by another status check.
      --connection server_1
      insert into t1 values (1);
       
      --echo #
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
       
      --echo #
      --echo # Cleanup
       
      # NOTE(review): the globals changed above and table t1 are not restored
      # or dropped before rpl_end.inc; acceptable for a bug-report snippet, but
      # would need cleanup if turned into a permanent test - confirm.
      --source include/rpl_end.inc
      --echo # End of test
      

      With result:

      include/rpl_init.inc [topology=1->2,1->3]
      connection server_1;
      set global rpl_semi_sync_master_enabled= 1;
      set global rpl_semi_sync_master_timeout= 500;
      connection server_2;
      include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      include/start_slave.inc
      connection server_3;
      include/stop_slave.inc
      set global rpl_semi_sync_slave_enabled= 1;
      set global debug_dbug="+d,simulate_delay_semisync_slave_reply";
      include/start_slave.inc
      connection server_1;
      # Ensure semi-sync is working for both connected replicas
      #
      # Semi-sync status starts ON
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	ON
      connection server_1;
      create table t1 (a int);
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	ON
      connection server_1;
      insert into t1 values (1);
      #
      SHOW STATUS LIKE 'rpl_semi_sync_master_status';
      Variable_name	Value
      Rpl_semi_sync_master_status	ON
      #
      # Cleanup
      include/rpl_end.inc
      # End of test
      

      Attachments

        Issue Links

          Activity

            People

              monty Michael Widenius
              bnestere Brandon Nesterenko
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.