|
binlog_encryption.rpl_checksum 'mix' w1 [ fail ]
|
Test ended at 2020-06-30 08:48:47
|
|
CURRENT_TEST: binlog_encryption.rpl_checksum
|
--- /mnt/buildbot/build/mariadb-10.4.14/mysql-test/suite/binlog_encryption/rpl_checksum.result 2020-06-30 07:10:01.000000000 -0400
|
+++ /mnt/buildbot/build/mariadb-10.4.14/mysql-test/suite/binlog_encryption/rpl_checksum.reject 2020-06-30 08:48:46.815372498 -0400
|
@@ -80,7 +80,7 @@
|
set @@global.debug_dbug='d,simulate_slave_unaware_checksum';
|
start slave;
|
include/wait_for_slave_io_error.inc [errno=1236]
|
-Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 411, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
+Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 371, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
select count(*) as zero from t1;
|
zero
|
0
|
|
mysqltest: Result content mismatch
|
|
|
For what it is worth, I observed this failure on a 10.6-based branch after applying the following incorrect patch. This might suggest a relationship with MDEV-25611.
@@ -2444,13 +2445,23 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
|
NOTE: The calling thread is not allowed to hold any buffer page latches! */
|
void buf_flush_sync()
|
{
|
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
|
+
|
for (;;)
|
{
|
- const ulint n_flushed= buf_flush_list(srv_max_io_capacity);
|
+ if (!UT_LIST_GET_LEN(buf_pool.flush_list))
|
+ break;
|
+
|
+ const lsn_t lsn= log_sys.get_lsn();
|
+ if (buf_flush_sync_lsn < lsn)
|
+ buf_flush_sync_lsn= lsn;
|
+ pthread_cond_signal(&buf_pool.do_flush_list);
|
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
|
buf_flush_wait_batch_end_acquiring_mutex(false);
|
- if (!n_flushed && !buf_flush_list_length())
|
- return;
|
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
|
}
|
+
|
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
|
}
|
|
#ifdef UNIV_DEBUG
|
|
|
Actually the reason may be simply in non-determistic number of "internal" replication events, such as Binlog_checkpoint one whose side is 40 bytes iIn the Alice's comments case when without checksum.
|
|
A recent failure:
|
10.3 d7ba72ea9bfc26b6b7172b4ad6c5a1ed
|
binlog_encryption.rpl_checksum 'mix' w2 [ fail ]
|
Test ended at 2022-08-10 08:29:06
|
|
CURRENT_TEST: binlog_encryption.rpl_checksum
|
--- /mnt/buildbot/build/mariadb-10.3.36/mysql-test/suite/binlog_encryption/rpl_checksum.result 2022-08-10 05:44:35.000000000 -0400
|
+++ /mnt/buildbot/build/mariadb-10.3.36/mysql-test/suite/binlog_encryption/rpl_checksum.reject 2022-08-10 08:29:06.597403815 -0400
|
@@ -80,7 +80,7 @@
|
set @@global.debug_dbug='d,simulate_slave_unaware_checksum';
|
start slave;
|
include/wait_for_slave_io_error.inc [errno=1236]
|
-Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 411, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
+Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 371, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
select count(*) as zero from t1;
|
zero
|
0
|
|
mysqltest: Result content mismatch
|
This was also observed in a 10.11 based branch:
|
bb-10.11-new-innodb-defaults 29834b0c5b8b70434aa0bdbae5d4dff9735c5a54
|
binlog_encryption.rpl_checksum 'mix' w52 [ fail ]
|
Test ended at 2022-11-30 13:30:04
|
|
CURRENT_TEST: binlog_encryption.rpl_checksum
|
--- /home/buildbot/aarch64-ubuntu-2004-debug/build/mysql-test/suite/binlog_encryption/rpl_checksum.result 2022-11-30 13:27:34.000000000 +0000
|
+++ /home/buildbot/aarch64-ubuntu-2004-debug/build/mysql-test/suite/binlog_encryption/rpl_checksum.reject 2022-11-30 13:30:03.657255241 +0000
|
@@ -85,7 +85,7 @@
|
set @@global.debug_dbug='d,simulate_slave_unaware_checksum';
|
start slave;
|
include/wait_for_slave_io_error.inc [errno=1236]
|
-Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 411, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
+Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 371, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
select count(*) as zero from t1;
|
zero
|
0
|
|
mysqltest: Result content mismatch
|
That branch is ahead of 10.11 b81b1943932a383a92bcd22d053e112198507ae2 by 2 rather trivial InnoDB changes: MDEV-29983 and MDEV-29986.
|
Actually the reason may be simply in non-determistic number of "internal" replication events, such as Binlog_checkpoint one whose side is 40 bytes iIn the Alice's comments case when without checksum.
Yes, it looks like it. The failing test error log for the primary showed:
2022-11-30 13:30:02 12 [Note] Start binlog_dump to slave_server(2), pos(master-bin.000009, 371), using_gtid(0), gtid('')
|
Whereas for a passing test the primary shows:
2022-11-30 13:30:07 12 [Note] Start binlog_dump to slave_server(2), pos(master-bin.000009, 411), using_gtid(0), gtid('')
|
Creating a custom test (in the binlog suite so the binlog is not encrypted) that ends after checking for the replica error produced for the master-bin.000009:
# at 256
|
#221202 0:04:14 server id 1 end_log_pos 295 Gtid list [0-1-5]
|
# at 295
|
#221202 0:04:14 server id 1 end_log_pos 335 Binlog checkpoint master-bin.000008
|
# at 335
|
#221202 0:04:14 server id 1 end_log_pos 375 Binlog checkpoint master-bin.000009
|
# at 375
|
#221202 0:04:14 server id 1 end_log_pos 419 Rotate to master-bin.000010 pos: 4
|
and for the replica error:
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 375, the last event read from 'master-bin.000010' at 4, the last byte read from 'master-bin.000010' at 256.''
|
Correlating the 375 position to 411, the incorrect 371 position indicates that Binlog checkpoint master-bin.000009 was probably not written to the binlog in time before master-bin.000010 is created at set @@global.binlog_checksum = CRC32;, as the flush logs before sync_slave_with_master creates master-bin.000009.
Therefore, an extra --source include/wait_for_binlog_checkpoint.inc before master-bin.000010 is created would probably resolve this.
|
|
Hi Elkin can you please take a look at my analysis? Thank you!
|
|
angelique.sklavounos Agree with your analysis, I will fix.
- Kristian.
|
|
Pushed to 10.4
|