Details
-
Bug
-
Status: In Review (View Workflow)
-
Blocker
-
Resolution: Unresolved
-
N/A
-
Can result in data loss
-
Q2/2026 Server Maintenance
Description
In MDEV-37949, I wanted to make log_sys.buf_size be trustable in the log-based writing code path. I failed to realize that the field is limited to 32 bits due to an API limitation of write system calls, but that in the memory-mapped code path we can actually have a larger innodb_log_file_size.
As far as I can tell, we have two possible fixes: change the type of log_sys.buf_size to size_t, or adjust some code, like this:
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
|
index 1ae2d5a7cf0..c92d71c10d9 100644
|
--- a/storage/innobase/buf/buf0flu.cc
|
+++ b/storage/innobase/buf/buf0flu.cc
|
@@ -1848,7 +1848,6 @@ inline void log_t::write_checkpoint(lsn_t checkpoint, lsn_t end_lsn) noexcept
|
resize_buf= nullptr;
|
first_lsn+= capacity();
|
file_size= resize_target;
|
- buf_size= unsigned(capacity());
|
goto unmap_old_checkpoint;
|
}
|
else if (c && is_mmap())
|
@@ -1873,9 +1872,7 @@ inline void log_t::write_checkpoint(lsn_t checkpoint, lsn_t end_lsn) noexcept
|
#ifdef HAVE_PMEM
|
if (!c)
|
{
|
- buf_size= unsigned(capacity());
|
first_checkpoint_in_new_archive:
|
- ut_ad(buf_size == capacity());
|
c= buf;
|
ut_ad(!memcmp_aligned<512>(c, field_ref_zero, START_OFFSET));
|
}
|
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
|
index 4bdd32e20e0..7ac39dc1ba1 100644
|
--- a/storage/innobase/log/log0log.cc
|
+++ b/storage/innobase/log/log0log.cc
|
@@ -494,7 +494,6 @@ void log_t::create(lsn_t lsn) noexcept
|
ut_ad(is_mmap_writeable());
|
ut_ad(is_opened() == archive);
|
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
|
- buf_size= unsigned(capacity());
|
if (archive)
|
goto create_archive_header;
|
memset_aligned<4096>(buf, 0, 4096);
|
@@ -1888,7 +1887,6 @@ void log_t::clear_mmap() noexcept
|
#ifdef HAVE_PMEM
|
if (is_mmap_writeable() && !recv_sys.rpo)
|
{
|
- buf_size= unsigned(capacity());
|
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
|
ut_ad(next_checkpoint_no <= START_OFFSET / 4);
|
if (archive)
|
@@ -1921,7 +1919,7 @@ void log_t::clear_mmap() noexcept
|
|
close_file(false);
|
log_mmap= false;
|
- buf_size= std::min(buf_size, unsigned(capacity()));
|
+ buf_size= unsigned(std::min(uint64_t{buf_size}, capacity()));
|
ut_a(attach(log, file_size, READ_WRITE));
|
ut_ad(!is_mmap());
|
|
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
|
index 93cd638a229..7ee02995898 100644
|
--- a/storage/innobase/mtr/mtr0mtr.cc
|
+++ b/storage/innobase/mtr/mtr0mtr.cc
|
@@ -342,7 +342,6 @@ void mtr_t::release()
|
ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept
|
{
|
ut_ad(latch_have_wr());
|
- ut_ad(buf_size == capacity());
|
if (!archive)
|
return 0;
|
ut_ad(file_size <= ARCHIVE_FILE_SIZE_MAX);
|
@@ -359,7 +358,6 @@ ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept
|
resize_buf= nullptr;
|
first_lsn= last_lsn;
|
file_size= resize_target;
|
- buf_size= unsigned(capacity());
|
return lsn;
|
}
|
|
@@ -950,7 +948,6 @@ log_t::append_prepare<log_t::ARCHIVED_MMAP>(size_t size, bool ex) noexcept
|
ut_ad(ex ? latch_have_wr() : latch_have_rd());
|
ut_ad(is_mmap());
|
ut_ad(is_mmap_writeable());
|
- ut_ad(buf_size == capacity());
|
ut_ad(archive);
|
ut_ad(archived_lsn);
|
|
@@ -959,7 +956,7 @@ log_t::append_prepare<log_t::ARCHIVED_MMAP>(size_t size, bool ex) noexcept
|
while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) &
|
(WRITE_TO_BUF - 1)) + size +
|
(lsn= base_lsn.load(std::memory_order_relaxed)) >=
|
- first_lsn + buf_size) &&
|
+ first_lsn + capacity()) &&
|
!resize_buf && !checkpoint_buf)
|
{
|
/* The following is inlined here instead of being part of
|
@@ -1062,11 +1059,11 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
|
static_assert(mode == WRITE_NORMAL || mode == CIRCULAR_MMAP, "");
|
ut_ad(bool(mode) == is_mmap());
|
ut_ad(is_mmap() == is_mmap_writeable());
|
- ut_ad(!mode || buf_size == capacity());
|
uint64_t l;
|
static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, "");
|
while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) &
|
- (WRITE_TO_BUF - 1)) >= buf_size - size))
|
+ (WRITE_TO_BUF - 1)) >=
|
+ (mode ? capacity() : buf_size) - size))
|
{
|
/* The following is inlined here instead of being part of
|
append_prepare_wait(), in order to increase the locality of reference
|
@@ -1084,7 +1081,7 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
|
set_check_for_checkpoint(true);
|
|
return {lsn,
|
- buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % buf_size : l)};
|
+ buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)};
|
}
|
|
/** Finish appending data to the log. |
This code change would make mtr_t::finish_writer<log_t::CIRCULAR_MMAP>(mtr_t*,size_t) expand by 18 bytes on x86-64-v3. The difference is due to about 6 added instructions. We would also save some instructions elsewhere, because log_sys.buf_size would have to be maintained less often.
The alternative of widening the type of log_sys.buf_size would expand mtr_t::finish_writer<log_t::CIRCULAR_MMAP>(mtr_t*,size_t) by 16 bytes on x86-64-v3. The differences in the generated code would be as follows:
- a 32-bit load is replaced with 64-bit
- a 32-bit remainder computation is replaced with 64-bit
- some padding is added before a CRC-32C loop
- some offsets for accessing members of log_sys are changed
The size of mtr_t::finish_writer<log_t::ARCHIVED_MMAP>(mtr_t*,size_t) is not affected by these changes. The size of some instructions (including some padding before a loop) does change, but the number of instructions does not.
I am leaning towards the larger patch above.
Attachments
Issue Links
- blocks
-
MDEV-14992 BACKUP SERVER to mounted file system
-
- In Progress
-
- is caused by
-
MDEV-37949 Implement innodb_log_archive
-
- Approved
-