diff --git a/mysql-test/suite/rpl/r/rpl_gtid_crash.result b/mysql-test/suite/rpl/r/rpl_gtid_crash.result index 0b5cfcb..4dab29a 100644 --- a/mysql-test/suite/rpl/r/rpl_gtid_crash.result +++ b/mysql-test/suite/rpl/r/rpl_gtid_crash.result @@ -112,5 +112,86 @@ SHOW VARIABLES like 'gtid_strict_mode'; Variable_name Value gtid_strict_mode ON include/start_slave.inc +*** MDEV-6462: Incorrect recovery on a slave reconnecting to crashed master *** +set sql_log_bin= 0; +call mtr.add_suppression("Error writing file 'master-bin'"); +set sql_log_bin= 1; +set sql_log_bin= 0; +call mtr.add_suppression("Unexpected switch of master, GTID has changed binlog position."); +set sql_log_bin= 1; +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (11); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +SELECT * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +# Wait 30 seconds for SQL thread to catch up with IO thread +SELECT * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +# Repeat this with additional transactions on the master +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (12); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +INSERT INTO t1 VALUES (13); +INSERT INTO t1 VALUES (14); +SELECT * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +13 +14 +include/save_master_gtid.inc +include/sync_with_master_gtid.inc +SELECT * from t1; +a +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +13 +14 DROP TABLE t1; include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_gtid_crash.test b/mysql-test/suite/rpl/t/rpl_gtid_crash.test index 21b35a3..fc064c2 100644 --- a/mysql-test/suite/rpl/t/rpl_gtid_crash.test +++ b/mysql-test/suite/rpl/t/rpl_gtid_crash.test 
@@ -294,6 +294,92 @@ SHOW VARIABLES like 'gtid_strict_mode'; eval SET GLOBAL gtid_strict_mode= $old_gtid_strict; --enable_query_log + +--echo *** MDEV-6462: Incorrect recovery on a slave reconnecting to crashed master *** + +--connection server_1 +set sql_log_bin= 0; +call mtr.add_suppression("Error writing file 'master-bin'"); +set sql_log_bin= 1; +--connection server_2 +set sql_log_bin= 0; +call mtr.add_suppression("Unexpected switch of master, GTID has changed binlog position."); +set sql_log_bin= 1; + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (11); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc +--enable_reconnect +--source include/wait_until_connected_again.inc + +SELECT * from t1; + +--echo # Wait 30 seconds for SQL thread to catch up with IO thread +--connection server_2 +--let $wait_timeout= 300 +while ($wait_timeout != 0) +{ + --let $read_log_pos= query_get_value('SHOW SLAVE STATUS', Read_Master_Log_Pos, 1) + --let $exec_log_pos= query_get_value('SHOW SLAVE STATUS', Exec_Master_Log_Pos, 1) + if ($read_log_pos == $exec_log_pos) + { + --let $wait_timeout= 0 + } + if ($read_log_pos != $exec_log_pos) + { + --sleep 0.1 + --dec $wait_timeout + } +} +if ($read_log_pos != $exec_log_pos) +{ + --die Timeout wait for SQL thread to catch up with IO thread +} + +SELECT * from t1; + +--echo # Repeat this with additional transactions on the master + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (12); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc +--enable_reconnect 
+--source include/wait_until_connected_again.inc + +INSERT INTO t1 VALUES (13); +INSERT INTO t1 VALUES (14); +SELECT * from t1; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +SELECT * from t1; + + --connection server_1 DROP TABLE t1; diff --git a/sql/log.cc b/sql/log.cc index 5614c0e..caf6f53 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -7439,6 +7439,13 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, } } + DBUG_EXECUTE_IF("inject_error_writing_xid", + { + entry->error_cache= NULL; + entry->commit_errno= 28; + DBUG_RETURN(ER_ERROR_ON_WRITE); + }); + if (entry->end_event->write(&log_file)) { entry->error_cache= NULL; diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h index 47da23d..21a60c7 100644 --- a/sql/rpl_mi.h +++ b/sql/rpl_mi.h @@ -164,6 +164,10 @@ class Master_info : public Slave_reporting_capability rpl_gtid last_queued_gtid; /* Whether last_queued_gtid had the FL_STANDALONE flag set. */ bool last_queued_gtid_standalone; + /* Log file name of the position on the master of the last seen GTID event. */ + char last_gtid_log_name[FN_REFLEN]; + /* Log position on the master of the last seen GTID event. */ + my_off_t last_gtid_log_pos; /* When slave IO thread needs to reconnect, gtid_reconnect_event_skip_count counts number of events to skip from the first GTID-prefixed event group, diff --git a/sql/slave.cc b/sql/slave.cc index d3e9500..64a834a 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -5212,6 +5212,48 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len, event_len - BINLOG_CHECKSUM_LEN : event_len, mi->rli.relay_log.description_event_for_queue); + // Make sure that we are connected to the same master if we have reconnected + // in the middle of an event group. 
+ if (unlikely(mi->gtid_reconnect_event_skip_count && !mi->gtid_event_seen) && + (strlen(mi->last_gtid_log_name) != rev.ident_len || + strncmp(rev.new_log_ident, mi->last_gtid_log_name, rev.ident_len) || + rev.pos != mi->last_gtid_log_pos)) + { + String new_log_name; + new_log_name.append(rev.new_log_ident, rev.ident_len); + sql_print_warning("Unexpected switch of master, GTID has changed binlog " + "position. Expected: %s:%llu, received: %s:%llu. " + "Assuming that the master has crashed.", + mi->last_gtid_log_name, + (ulonglong) mi->last_gtid_log_pos, + new_log_name.c_ptr_safe(), + (ulonglong) rev.pos); + + // If we have reconnected to a different master or master has crashed and + // rolled back previous transaction (thus starting new transaction with + // the same GTID, but at different log position), we need to write bogus + // Format description event into relay log to signal SQL thread that + // the active transaction should be rolled back. + Format_description_log_event fdle(4); + + mysql_mutex_lock(log_lock); + if (likely(!fdle.write(rli->relay_log.get_log_file()) && + !rli->relay_log.flush_and_sync(NULL))) + { + rli->relay_log.harvest_bytes_written(&rli->log_space_total); + } + else + { + error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE; + mysql_mutex_unlock(log_lock); + goto err; + } + rli->relay_log.signal_update(); + mysql_mutex_unlock(log_lock); + + mi->gtid_reconnect_event_skip_count= 0; + mi->events_queued_since_last_gtid= 0; + } if (unlikely(process_io_rotate(mi, &rev))) { error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE; @@ -5449,6 +5449,10 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len, got_gtid_event= true; if (mi->using_gtid == Master_info::USE_GTID_NO) goto default_action; + /* strncpy() does not terminate on truncation; force a trailing NUL. */ + strncpy(mi->last_gtid_log_name, mi->master_log_name, FN_REFLEN - 1); + mi->last_gtid_log_name[FN_REFLEN - 1]= 0; + mi->last_gtid_log_pos= mi->master_log_pos; if (unlikely(!mi->gtid_event_seen)) { mi->gtid_event_seen= true;