Details
-
Bug
-
Status: Closed (View Workflow)
-
Major
-
Resolution: Fixed
-
10.4(EOL), 10.5, 10.6, 10.7(EOL)
-
None
Description
The following test shows a case where a SR transaction may be rolled back locally, but is never rolled back in the rest of cluster:
--source include/galera_cluster.inc
|
--source include/have_debug_sync.inc
|
|
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 |
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 |
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 |
--connect node_3a, 127.0.0.1, root, , test, $NODE_MYPORT_3 |
|
--connection node_1
|
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY);
|
|
--connection node_2
|
SET SESSION wsrep_trx_fragment_size=1; |
BEGIN;
|
INSERT INTO t1 VALUES (21); |
|
--connection node_3a
|
SET GLOBAL wsrep_provider_options = 'gmcast.isolate = 1'; |
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--connection node_1a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--connection node_2a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
|
--echo #### block COMMIT before certification on node 2 ... |
--connection node_2
|
SET DEBUG_SYNC = 'wsrep_before_certification SIGNAL before_cert WAIT_FOR continue'; |
--send COMMIT
|
|
--connection node_2a
|
SET DEBUG_SYNC = 'now WAIT_FOR before_cert'; |
|
--echo #### ... and disconnect node 2 |
SET GLOBAL wsrep_provider_options = 'gmcast.isolate = 1'; |
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
|
--echo #### unblock commit and observe an error
|
SET DEBUG_SYNC = 'now SIGNAL continue'; |
|
--connection node_2
|
--error 1180 |
--reap
|
|
--connection node_2a
|
SET DEBUG_SYNC = 'RESET'; |
|
--connection node_1a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
|
--echo #### reconnect node 3 |
|
--connection node_3a
|
SET GLOBAL wsrep_provider_options = 'gmcast.isolate = 0'; |
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--connection node_1a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
SHOW STATUS LIKE 'wsrep_cluster_status'; |
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
|
--echo #### reconnect node 2 |
|
--connection node_2a
|
SET GLOBAL wsrep_provider_options = 'gmcast.isolate = 0'; |
--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
--source include/galera_wait_ready.inc
|
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--connection node_1a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
--source include/galera_wait_ready.inc
|
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--connection node_3a
|
SET SESSION wsrep_sync_wait = 0; |
--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' |
--source include/wait_condition.inc
|
--source include/galera_wait_ready.inc
|
SET SESSION wsrep_sync_wait = DEFAULT;
|
|
--echo observe entries still in wsrep_streaming_log table
|
--connection node_1a
|
SELECT COUNT(*) FROM mysql.wsrep_streaming_log;
|
--connection node_2a
|
SELECT COUNT(*) FROM mysql.wsrep_streaming_log;
|
--connection node_3a
|
SELECT COUNT(*) FROM mysql.wsrep_streaming_log;
|
|
--connection node_1
|
DROP TABLE t1;
|
The test issues a SR transaction that replicates a fragment, and attempts to commit. The commit fails because the node is temporarily in a non-primary view. The commit failure causes the following events:
node attempts to send a rollback fragment. this step fails also (node is still non-primary)
the failure in step 1) is ignored (see streaming_rollback() and its callers in wsrep-lib)
the transaction is rolled back locally
the transaction remains alive in the rest of the cluster, the other nodes do not recognize this as a orphaned transaction (see server_state::close_orphaned_sr_transactions())
Notice that the test uses a specific sequence of views events, so from node 2's perspective we have these events:
primary view (1,2,3)
start transaction; insert (1)
primary view (1,2)
non primary
commit() // failure and rollback here
primary view (1,2,3)
So that the views in steps 3 and 6 are not equal consecutive views (see close_orphaned_sr_transactions())