Details
-
Bug
-
Status: Closed (View Workflow)
-
Minor
-
Resolution: Fixed
-
10.0.15
Description
This was found by code inspection while debugging MDEV-7326. It is probably
unrelated to that bug, except for showing similar symptoms.
The problem occurs if the relaylog (or master binlog, from which relay logs
are copied) contains an event group with GTID that is missing the end
COMMIT/XID event.
There is code in parallel replication that tries to handle this situation, but
it is insufficient: it miscounts count_committing_event_groups and
count_queued_event_groups. The result is that the following batch of event
groups will hang waiting for the prior groups to complete, because the
incomplete (half-finished) event group never "completes" for the purpose of this wait.
Basically, this code is missing a call to mark_start_commit() (it should also be
checked whether anything else is missing; perhaps a full finish_event_group() is needed):
if(thd->wait_for_commit_ptr)
|
{
|
/*
|
This indicates that we get a new GTID event in the middle of
|
a not completed event group. This is corrupt binlog (the master
|
will never write such binlog), so it does not happen unless
|
someone tries to inject wrong crafted binlog, but let us still
|
try to handle it somewhat nicely.
|
*/
|
rgi->cleanup_context(thd, true);
|
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
|
thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error);
|
}
|
Here is an MTR test case to reproduce the bug. It requires a DBUG patch:
=== modified file 'sql/slave.cc'
|
--- sql/slave.cc 2014-12-02 11:11:07 +0000
|
+++ sql/slave.cc 2014-12-16 13:01:43 +0000
|
@@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,c
|
}
|
break;
|
|
+#ifndef DBUG_OFF
|
+ case XID_EVENT:
|
+ DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
|
+ {
|
+ /* Inject an event group that is missing its XID commit event. */
|
+ if (mi->last_queued_gtid.domain_id == 0 &&
|
+ mi->last_queued_gtid.seq_no == 1000)
|
+ goto skip_relay_logging;
|
+ });
|
+ /* Fall through to default case ... */
|
+#endif
|
+
|
default:
|
default_action:
|
if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
|
# Reproduce a parallel-replication hang caused by an event group in the relay
# log that has a GTID but is missing its terminating XID (commit) event.
#
# Requires a debug build with the DBUG injection point
# "slave_discard_xid_for_gtid_0_x_1000", which makes the slave skip relay
# logging of the XID event for the group with domain_id 0 and seq_no 1000.
--source include/have_debug.inc
--source include/have_innodb.inc
--source include/have_binlog_format_statement.inc
--source include/master-slave.inc

--connection server_1
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
--save_master_pos

--connection server_2
--sync_with_master
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=slave_pos;
# Remember the current settings so the cleanup section can restore them.
SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads;
SET GLOBAL slave_parallel_threads=1;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";

--connection server_1
INSERT INTO t1 VALUES (1);
INSERT INTO t1 VALUES (2);
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
INSERT INTO t1 VALUES (5);
# Give the next event group seq_no 1000 so that it matches the condition
# checked by the DBUG injection point and loses its XID event on the slave.
SET gtid_seq_no=1000;
INSERT INTO t1 VALUES (6);
# The event groups below follow the incomplete one; they are the ones the
# bug report describes as hanging while waiting for prior groups to complete.
INSERT INTO t1 VALUES (7);
INSERT INTO t1 VALUES (8);
INSERT INTO t1 VALUES (9);
INSERT INTO t1 VALUES (10);
INSERT INTO t1 VALUES (11);
INSERT INTO t1 VALUES (12);
INSERT INTO t1 VALUES (13);
INSERT INTO t1 VALUES (14);
INSERT INTO t1 VALUES (15);
INSERT INTO t1 VALUES (16);
INSERT INTO t1 VALUES (17);
INSERT INTO t1 VALUES (18);
INSERT INTO t1 VALUES (19);
INSERT INTO t1 VALUES (20);
INSERT INTO t1 VALUES (21);
INSERT INTO t1 VALUES (22);
INSERT INTO t1 VALUES (23);
INSERT INTO t1 VALUES (24);
INSERT INTO t1 VALUES (25);
INSERT INTO t1 VALUES (26);
INSERT INTO t1 VALUES (27);
INSERT INTO t1 VALUES (28);
INSERT INTO t1 VALUES (29);
INSERT INTO t1 VALUES (30);
--source include/save_master_gtid.inc

--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t1 ORDER BY a;

# Clean up: restore the slave settings changed above.
--source include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc

--connection server_1
# Drop the table created by this test (the original dropped a function
# "foo" that this test never creates).
DROP TABLE t1;

--source include/rpl_end.inc
|