Details
-
Bug
-
Status: Closed (View Workflow)
-
Critical
-
Resolution: Fixed
-
10.11, 11.1(EOL), 11.2, 11.4
-
None
Description
While analyzing MDEV-34422, I found a critical issue that could corrupt the redo log for general case in high load. This leads to LSN moving backwards with the possibility of data loss and unrecoverable server if the server aborts for any reason or is shutdown with innodb_fast_shutdown=2. In debug mode it would hit assert.
Here is a repeatable scenario with a debug mtr. We lower the redo buffer size to 2M and drive the code path to lead to a situation where redo log buffer gets full during mtr commit. Also, the test currently works only with no memory mapping for redo log (although the issue exist with memory map too). Memory map is currently used in linux and the decision is taken dynamically at server startup when we disable memory map if my_mmap call fails with MAP_FAILED.
In my linux box memory map is disabled when I use HDD as the log storage and it readily repeats for me. Otherwise on linux one need to force disable it in code to repeat it.
This is not hit usually as we would flush the log at regular interval and also check and flush it post mtr commit when it is close to half full.
log_buffer_flush.opt
--innodb-log-buffer-size=2M |
log_buffer_flush.test
--source include/have_innodb.inc
|
--source include/have_debug.inc
|
--source include/have_debug_sync.inc
|
--source include/count_sessions.inc
|
|
SET @save_stats_persistent = @@GLOBAL.innodb_stats_persistent; |
SET GLOBAL innodb_stats_persistent = 0; |
|
CREATE TABLE t1 ( |
col1 INT AUTO_INCREMENT PRIMARY KEY, col2 VARCHAR(1024)) ENGINE=InnoDB; |
CREATE TABLE t2(col1 INT PRIMARY KEY) ENGINE=InnoDB; |
|
INSERT INTO t1(col2) VALUES(REPEAT('abc', 300)); |
|
--connect (con1,localhost,root)
|
--echo # Block purge
|
START TRANSACTION; |
SELECT * FROM t2; |
|
--connection default
|
LET $loop_count= 12;
|
while ($loop_count > 0)
|
{
|
INSERT INTO t1(col2) SELECT col2 FROM t1; |
dec $loop_count; |
}
|
|
SELECT COUNT(*) from t1; |
SHOW VARIABLES LIKE 'innodb_log_buffer_size'; |
SHOW VARIABLES LIKE 'innodb_log_file_size'; |
|
--echo block redo log buffer to flush
|
SET @old_debug_dbug = @@global.debug_dbug; |
SET GLOBAL debug_dbug = "+d,ib_log_block_write"; |
SET GLOBAL debug_dbug = "+d,ib_log_checkpoint_avoid_hard"; |
|
SET DEBUG_SYNC="log_append_prepare_wait_no_lock SIGNAL waiting_mtr_commit WAIT_FOR operation_done"; |
--send INSERT INTO t1(col2) SELECT col2 FROM t1
|
|
--connection con1
|
SET DEBUG_SYNC='now WAIT_FOR waiting_mtr_commit'; |
|
--echo allow redo log buffer to flush
|
SET @@global.debug_dbug = @old_debug_dbug; |
|
--echo # Perform some operation to let the LSN progress
|
INSERT INTO t2 VALUES(100); |
COMMIT; |
SET DEBUG_SYNC='now SIGNAL operation_done'; |
|
--connection default
|
--reap
|
SELECT COUNT(*) from t1; |
|
--disconnect con1
|
SET GLOBAL innodb_stats_persistent = @save_stats_persistent; |
--source include/wait_until_count_sessions.inc
|
|
SET DEBUG_SYNC="reset"; |
|
DROP TABLE t1; |
DROP TABLE t2; |
Code changes for debug enforcement
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
|
index f5d9cf85c41..e89cf8f148c 100644
|
--- a/storage/innobase/log/log0log.cc
|
+++ b/storage/innobase/log/log0log.cc
|
@@ -955,6 +955,8 @@ void log_write_up_to(lsn_t lsn, bool durable,
|
ut_ad(lsn != LSN_MAX);
|
ut_ad(lsn != 0);
|
|
+ DBUG_EXECUTE_IF("ib_log_block_write", return;);
|
+
|
if (UNIV_UNLIKELY(recv_no_ibuf_operations))
|
{
|
/* A non-final batch of recovery is active no writes to the log
|
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
|
index 6f3abb442a3..86d24c69700 100644
|
--- a/storage/innobase/mtr/mtr0mtr.cc
|
+++ b/storage/innobase/mtr/mtr0mtr.cc
|
@@ -1011,6 +1011,7 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn)
|
else
|
latch.rd_unlock();
|
|
+ DEBUG_SYNC_C("log_append_prepare_wait_no_lock");
|
log_write_up_to(lsn, is_pmem());
|
|
if (ex)
|
Case:1 Debug Assert is hit while flushing log.
|
./mtr innodb.log_buffer_flush
|
mariadbd: /home/hdd/deb/maria-src2/storage/innobase/log/log0log.cc:863: lsn_t log_t::write_buf() [with bool release_latch = true; lsn_t = long unsigned int]: Assertion `new_buf_free == ((lsn - first_lsn) & write_size_1)' failed. |
240801 17:56:24 [ERROR] mysqld got signal 6 ; |
Sorry, we probably made a mistake, and this is a bug. |
...
|
log/log0log.cc:864(unsigned long log_t::write_buf<true>())[0x6149dde38ee1] |
log/log0log.cc:994(log_write_up_to(unsigned long, bool, completion_callback const*))[0x6149dde35eaf] |
log/log0log.cc:1016(log_buffer_flush_to_disk(bool))[0x6149dde35f64] |
srv/srv0srv.cc:1270(srv_sync_log_buffer_in_background())[0x6149ddfe1c09] |
srv/srv0srv.cc:1374(srv_master_callback)[0x6149ddfe22e6] |
Case-2: We disable a few debug asserts to see whet happens in release mode.
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
|
index e89cf8f148c..876d3517f6c 100644
|
--- a/storage/innobase/log/log0log.cc
|
+++ b/storage/innobase/log/log0log.cc
|
@@ -833,7 +833,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
|
ut_ad(ut_is_2pow(write_size));
|
size_t length{buf_free.load(std::memory_order_relaxed)};
|
lsn_t offset{calc_lsn_offset(write_lsn)};
|
- ut_ad(length >= (offset & write_size_1));
|
+ //ut_ad(length >= (offset & write_size_1));
|
ut_ad(write_size_1 >= 511);
|
|
const byte *const write_buf{buf};
|
@@ -841,7 +841,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
|
|
if (length <= write_size_1)
|
{
|
- ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1));
|
+ // ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1));
|
/* Keep filling the same buffer until we have more than one block. */
|
#if 0 /* TODO: Pad the last log block with dummy records. */
|
buf_free= log_pad(lsn, (write_size_1 + 1) - length,
|
@@ -860,7 +860,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
|
else
|
{
|
const size_t new_buf_free{length & write_size_1};
|
- ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1));
|
+ // ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1));
|
buf_free.store(new_buf_free, std::memory_order_relaxed);
|
|
if (new_buf_free)
|
In the test log_buffer_flush.test try restarting the server at the end.
...
|
--echo # Try Restart
|
--source include/restart_mysqld.inc |
./mtr innodb.log_buffer_flush
|
2024-07-31 20:46:26 0 [ERROR] InnoDB: Missing FILE_CHECKPOINT(8416546) at 8416546 |
2024-07-31 20:46:26 0 [ERROR] InnoDB: Log scan aborted at LSN 8416546 |
2024-07-31 20:46:26 0 [ERROR] InnoDB: Plugin initialization aborted at srv0start.cc[1462] with error Generic error |
2024-07-31 20:46:26 0 [ERROR] Plugin 'InnoDB' registration as a STORAGE ENGINE failed. |
^ Found warnings in /home/hdd/deb/maria-src2/bld_dbg/mysql-test/var/log/mysqld.1.err |
Case-3: We try to backup the server.
|
--echo # Try Backup
|
let $targetdir=$MYSQLTEST_VARDIR/tmp/backup;
|
|
mkdir $targetdir;
|
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir |
./mtr innodb.log_buffer_flush
|
[00] 2024-07-31 20:59:06 Waiting for log copy thread to read lsn 8416446 |
[00] 2024-07-31 20:59:07 Retrying read of log at LSN=7334851 |
[00] 2024-07-31 20:59:08 Retrying read of log at LSN=7334851 |
[00] 2024-07-31 20:59:09 Retrying read of log at LSN=7334851 |
[00] 2024-07-31 20:59:10 Retrying read of log at LSN=7334851 |
[00] FATAL ERROR: 2024-07-31 20:59:11 Was only able to copy log from 7334851 to 7334851, not 8416446; try increasing innodb_log_file_size |
Attachments
Issue Links
- is caused by
-
MDEV-27774 Reduce scalability bottlenecks in mtr_t::commit()
- Closed