commit 6ea550968b801bbf18aa0326913f7022593784b7
|
Author: Brandon Nesterenko <brandon.nesterenko@mariadb.com>
|
Date: Wed Jan 8 14:01:29 2025 -0700
|
|
MDEV-35096: History is stored in different partitions on different nodes when using SYSTEM VERSION
|
|
THIS PATCH IS ONLY MEANT TO HIGHLIGHT THE PROBLEM FOR SOMEONE WHO IS
|
MORE FAMILIAR WITH SYSTEM VERSIONING PARTITIONS. IT IS NOT MEANT TO
|
BE REVIEWED AS A SOLUTION.
|
|
There are two problems:
|
1. The m_part_info->hist_part is never set when applying row events
|
(partition_info::vers_set_hist_part(THD*)). For DML commands,
|
hist_part is set during ha_partition::external_lock(THD*,int) (also
|
I saw the call exists in ha_partition::start_stmt(THD*,
|
thr_lock_type), though I didn’t see the execution hit that).
|
|
This patch prototypes a fix by simply calling
|
m_part_info->vers_set_hist_part(current_thd) in the preamble of
|
ha_partition::update_row(); however, it seems more likely to me that
|
there is a higher-level call being missed, and this bug report is
|
just one symptom of such a problem.
|
|
2. LEX::sql_command is not set when applying row events, yet
|
vers_require_hist_part(THD*) uses it to determine if the history
|
partition is applicable (thd->lex->vers_history_generating()).
|
|
This patch prototypes a fix by adding || thd->rgi_slave to the
|
vers_require_hist_part() check. Again, this is not a good fix, but
|
it serves to highlight another missing piece.
|
|
Special thanks for the test case from Jan Lindstrom
|
<jan.lindstrom@galeracluster.com>.
|
|
diff --git a/mysql-test/suite/rpl/t/rpl_system_versioning_partitions.test b/mysql-test/suite/rpl/t/rpl_system_versioning_partitions.test
|
new file mode 100644
|
index 00000000000..ef59c41f1c6
|
--- /dev/null
|
+++ b/mysql-test/suite/rpl/t/rpl_system_versioning_partitions.test
|
@@ -0,0 +1,130 @@
|
+#
|
+# Ensure that when replicating a table that uses system versioning, and
|
+# partitions by system time, the historical partitions on the replica match
|
+# those on the master. The test creates a system-versioned table with
|
+# system_time-based partitioning, fills the table up with enough records that
|
+# bypass the size of each historical partition, and deletes the records from
|
+# the table to split the historical data across multiple partitions. The
|
+# partition distribution on the master and slave should be equivalent.
|
+#
|
+# References:
|
+# MDEV-35096: History is stored in different partitions on different nodes
|
+# when using SYSTEM VERSION
|
+#
|
+--source include/have_binlog_format_row.inc
|
+--source include/master-slave.inc
|
+--source include/have_innodb.inc
|
+--source include/have_partition.inc
|
+
|
+--echo #
|
+--echo # Initialize system-versioned and partitioned table and its data
|
+--connection master
|
+create table t1 (x int) engine=InnoDB with system versioning partition by system_time limit 3 partitions 5;
|
+insert into t1 values(1);
|
+insert into t1 values(2);
|
+insert into t1 values(3);
|
+insert into t1 values(4);
|
+insert into t1 values(5);
|
+--let $master_total_size= `select count(*) from t1`
|
+--let $master_p0_size= `select count(*) from t1 partition (p0)`
|
+--let $master_p1_size= `select count(*) from t1 partition (p1)`
|
+--let $master_p2_size= `select count(*) from t1 partition (p2)`
|
+--sync_slave_with_master
|
+
|
+--connection slave
|
+--let $slave_total_size= `select count(*) from t1`
|
+--let $slave_p0_size= `select count(*) from t1 partition (p0)`
|
+--let $slave_p1_size= `select count(*) from t1 partition (p1)`
|
+--let $slave_p2_size= `select count(*) from t1 partition (p2)`
|
+
|
+if ($slave_total_size != $master_total_size)
|
+{
|
+ --connection master
|
+ select count(*) from t0;
|
+ --connection slave
|
+ select count(*) from t1;
|
+ --die Size of t1 differs between master and slave
|
+}
|
+if ($slave_p0_size != $master_p0_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p0);
|
+ --connection slave
|
+ select count(*) from t1 partition (p0);
|
+ --die Size of t1 partition p0 differs between master and slave
|
+}
|
+if ($slave_p1_size != $master_p1_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p1);
|
+ --connection slave
|
+ select count(*) from t1 partition (p1);
|
+ --die Size of t1 partition p1 differs between master and slave
|
+}
|
+if ($slave_p2_size != $master_p2_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p2);
|
+ --connection slave
|
+ select count(*) from t1 partition (p2);
|
+ --die Size of t1 partition p2 differs between master and slave
|
+}
|
+
|
+--echo #
|
+--echo # Delete each row, which is stored as an update in the binary log
|
+--echo # due to system versioning
|
+--connection master
|
+delete from t1 where x=1;
|
+delete from t1 where x=2;
|
+delete from t1 where x=3;
|
+delete from t1 where x=4;
|
+delete from t1 where x=5;
|
+--let $master_total_size= `select count(*) from t1`
|
+--let $master_p0_size= `select count(*) from t1 partition (p0)`
|
+--let $master_p1_size= `select count(*) from t1 partition (p1)`
|
+--let $master_p2_size= `select count(*) from t1 partition (p2)`
|
+--sync_slave_with_master
|
+
|
+--connection slave
|
+--let $slave_total_size= `select count(*) from t1`
|
+--let $slave_p0_size= `select count(*) from t1 partition (p0)`
|
+--let $slave_p1_size= `select count(*) from t1 partition (p1)`
|
+--let $slave_p2_size= `select count(*) from t1 partition (p2)`
|
+
|
+if ($slave_total_size != $master_total_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1;
|
+ --connection slave
|
+ select count(*) from t1;
|
+ --die Size of t1 differs between master and slave
|
+}
|
+if ($slave_p0_size != $master_p0_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p0);
|
+ --connection slave
|
+ select count(*) from t1 partition (p0);
|
+ --die Size of t1 partition p0 differs between master and slave
|
+}
|
+if ($slave_p1_size != $master_p1_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p1);
|
+ --connection slave
|
+ select count(*) from t1 partition (p1);
|
+ --die Size of t1 partition p1 differs between master and slave
|
+}
|
+if ($slave_p2_size != $master_p2_size)
|
+{
|
+ --connection master
|
+ select count(*) from t1 partition (p2);
|
+ --connection slave
|
+ select count(*) from t1 partition (p2);
|
+ --die Size of t1 partition p2 differs between master and slave
|
+}
|
+
|
+--connection master
|
+drop table t1;
|
+
|
+--source include/rpl_end.inc
|
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
|
index 293e0f5cec4..6615374869a 100644
|
--- a/sql/ha_partition.cc
|
+++ b/sql/ha_partition.cc
|
@@ -4693,6 +4693,16 @@ int ha_partition::update_row(const uchar *old_data, const uchar *new_data)
|
// Need to read partition-related columns, to locate the row's partition:
|
DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
|
table->read_set));
|
+
|
+ /*
|
+ When called via Update_rows_log_event::do_exec_row, the hist_part is never
|
+ set, leading to the replica adding rows to the wrong partition when using
|
+ system_time partitions with system versioning.
|
+ */
|
+ if (current_thd->rgi_slave)
|
+ {
|
+ m_part_info->vers_set_hist_part(current_thd);
|
+ }
|
#ifndef DBUG_OFF
|
/*
|
The protocol for updating a row is:
|
diff --git a/sql/partition_info.h b/sql/partition_info.h
|
index ebd41ce1764..c3fe5ec949d 100644
|
--- a/sql/partition_info.h
|
+++ b/sql/partition_info.h
|
@@ -413,8 +413,14 @@ class partition_info : public Sql_alloc
|
}
|
bool vers_require_hist_part(THD *thd) const
|
{
|
+ /*
|
+ LEX::vers_history_generating() uses LEX::sql_command, but that isn't set
|
+ when replicating Row log events, which led to this always returning
|
+ false, and an incorrect partition being chosen. Just to highlight this
|
+ place, I added `|| thd->rgi_slave` so the attached test case can pass.
|
+ */
|
return part_type == VERSIONING_PARTITION &&
|
- thd->lex->vers_history_generating();
|
+ (thd->lex->vers_history_generating() || thd->rgi_slave);
|
}
|
int vers_set_hist_part(THD *thd);
|
void vers_check_limit(THD *thd);
|
I can reproduce inconsistency with normal async replication (test case attached):
jan@jan-ThinkPad-P1-Gen-6:~/work/mariadb/10.6/mysql-test$ ./mtr rpl.jan,row
Logging: ./mtr rpl.jan,row
VS config:
vardir: /home/jan/work/mariadb/10.6/mysql-test/var
Checking leftover processes...
Removing old var directory...
Creating var directory '/home/jan/work/mariadb/10.6/mysql-test/var'...
Checking supported features...
MariaDB Version 10.6.21-MariaDB-debug
- SSL connections supported
- binaries are debug compiled
- binaries built with wsrep patch
Collecting tests...
Installing system database...
==============================================================================
TEST RESULT TIME (ms) or COMMENT
--------------------------------------------------------------------------
worker[01] - 'localhost:19000' was not free
worker[01] Using MTR_BUILD_THREAD 301, with reserved ports 19030..19059
include/master-slave.inc
[connection master]
connection master;
CREATE TABLE t1 (x int) ENGINE=InnoDB WITH SYSTEM VERSIONING PARTITION BY SYSTEM_TIME LIMIT 3 PARTITIONS 5;
INSERT INTO t1 VALUES(1);
INSERT INTO t1 VALUES(2);
INSERT INTO t1 VALUES(3);
INSERT INTO t1 VALUES(4);
INSERT INTO t1 VALUES(5);
delete from t1 where x=1;
delete from t1 where x=2;
delete from t1 where x=3;
delete from t1 where x=4;
delete from t1 where x=5;
SELECT * FROM t1 PARTITION (p0);
x
1
2
3
SELECT * FROM t1 PARTITION (p1);
x
4
5
SELECT * FROM t1 PARTITION (p2);
x
connection slave;
connection slave;
SELECT * FROM t1 PARTITION (p0);
x
1
2
3
4
5
SELECT * FROM t1 PARTITION (p1);
x
SELECT * FROM t1 PARTITION (p2);
x
connection master;
DROP TABLE t1;
include/rpl_end.inc
rpl.jan 'row' [ pass ] 1137
--------------------------------------------------------------------------
The servers were restarted 0 times
Spent 1.137 of 4 seconds executing testcases
Completed: All 1 tests were successful.