Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-39225

Assertion failures with innodb_log_file_size>4g

    XMLWordPrintable

Details

    • Can result in data loss
    • Q2/2026 Server Maintenance

    Description

      In MDEV-37949, I wanted to make log_sys.buf_size be trustable in the log-based writing code path. I failed to realize that the field is limited to 32 bits due to an API limitation of write system calls, but that in the memory-mapped code path we can actually have a larger innodb_log_file_size.

      As far as I can tell, we have two possible fixes: change the type of log_sys.buf_size to size_t, or adjust some code, like this:

      diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
      index 1ae2d5a7cf0..c92d71c10d9 100644
      --- a/storage/innobase/buf/buf0flu.cc
      +++ b/storage/innobase/buf/buf0flu.cc
      @@ -1848,7 +1848,6 @@ inline void log_t::write_checkpoint(lsn_t checkpoint, lsn_t end_lsn) noexcept
               resize_buf= nullptr;
               first_lsn+= capacity();
               file_size= resize_target;
      -        buf_size= unsigned(capacity());
               goto unmap_old_checkpoint;
             }
             else if (c && is_mmap())
      @@ -1873,9 +1872,7 @@ inline void log_t::write_checkpoint(lsn_t checkpoint, lsn_t end_lsn) noexcept
       #ifdef HAVE_PMEM
               if (!c)
               {
      -          buf_size= unsigned(capacity());
               first_checkpoint_in_new_archive:
      -          ut_ad(buf_size == capacity());
                 c= buf;
                 ut_ad(!memcmp_aligned<512>(c, field_ref_zero, START_OFFSET));
               }
      diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
      index 4bdd32e20e0..7ac39dc1ba1 100644
      --- a/storage/innobase/log/log0log.cc
      +++ b/storage/innobase/log/log0log.cc
      @@ -494,7 +494,6 @@ void log_t::create(lsn_t lsn) noexcept
           ut_ad(is_mmap_writeable());
           ut_ad(is_opened() == archive);
           mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
      -    buf_size= unsigned(capacity());
           if (archive)
             goto create_archive_header;
           memset_aligned<4096>(buf, 0, 4096);
      @@ -1888,7 +1887,6 @@ void log_t::clear_mmap() noexcept
       #ifdef HAVE_PMEM
         if (is_mmap_writeable() && !recv_sys.rpo)
         {
      -    buf_size= unsigned(capacity());
           mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
           ut_ad(next_checkpoint_no <= START_OFFSET / 4);
           if (archive)
      @@ -1921,7 +1919,7 @@ void log_t::clear_mmap() noexcept
       
             close_file(false);
             log_mmap= false;
      -      buf_size= std::min(buf_size, unsigned(capacity()));
      +      buf_size= unsigned(std::min(uint64_t{buf_size}, capacity()));
             ut_a(attach(log, file_size, READ_WRITE));
             ut_ad(!is_mmap());
       
      diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
      index 93cd638a229..7ee02995898 100644
      --- a/storage/innobase/mtr/mtr0mtr.cc
      +++ b/storage/innobase/mtr/mtr0mtr.cc
      @@ -342,7 +342,6 @@ void mtr_t::release()
       ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept
       {
         ut_ad(latch_have_wr());
      -  ut_ad(buf_size == capacity());
         if (!archive)
           return 0;
         ut_ad(file_size <= ARCHIVE_FILE_SIZE_MAX);
      @@ -359,7 +358,6 @@ ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept
         resize_buf= nullptr;
         first_lsn= last_lsn;
         file_size= resize_target;
      -  buf_size= unsigned(capacity());
         return lsn;
       }
       
      @@ -950,7 +948,6 @@ log_t::append_prepare<log_t::ARCHIVED_MMAP>(size_t size, bool ex) noexcept
         ut_ad(ex ? latch_have_wr() : latch_have_rd());
         ut_ad(is_mmap());
         ut_ad(is_mmap_writeable());
      -  ut_ad(buf_size == capacity());
         ut_ad(archive);
         ut_ad(archived_lsn);
       
      @@ -959,7 +956,7 @@ log_t::append_prepare<log_t::ARCHIVED_MMAP>(size_t size, bool ex) noexcept
         while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) &
                               (WRITE_TO_BUF - 1)) + size +
                              (lsn= base_lsn.load(std::memory_order_relaxed)) >=
      -                       first_lsn + buf_size) &&
      +                       first_lsn + capacity()) &&
                !resize_buf && !checkpoint_buf)
         {
           /* The following is inlined here instead of being part of
      @@ -1062,11 +1059,11 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
         static_assert(mode == WRITE_NORMAL || mode == CIRCULAR_MMAP, "");
         ut_ad(bool(mode) == is_mmap());
         ut_ad(is_mmap() == is_mmap_writeable());
      -  ut_ad(!mode || buf_size == capacity());
         uint64_t l;
         static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, "");
         while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) &
      -                        (WRITE_TO_BUF - 1)) >= buf_size - size))
      +                        (WRITE_TO_BUF - 1)) >=
      +                       (mode ? capacity() : buf_size) - size))
         {
           /* The following is inlined here instead of being part of
           append_prepare_wait(), in order to increase the locality of reference
      @@ -1084,7 +1081,7 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
           set_check_for_checkpoint(true);
       
         return {lsn,
      -          buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % buf_size : l)};
      +          buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)};
       }
       
       /** Finish appending data to the log.
      

      This code change would make mtr_t::finish_writer<log_t::CIRCULAR_MMAP>(mtr_t*,size_t) expand by 18 bytes on x86-64-v3. The difference is due to about 6 added instructions. We would also save some instructions elsewhere, because log_sys.buf_size would have to be maintained less often.

      The alternative of widening the type of log_sys.buf_size would expand mtr_t::finish_writer<log_t::CIRCULAR_MMAP>(mtr_t*,size_t) by 16 bytes on x86-64-v3. The differences in the generated code would be as follows:

      • a 32-bit load is replaced with 64-bit
      • a 32-bit remainder computation is replaced with 64-bit
      • some padding is added before a CRC-32C loop
      • some offsets for accessing members of log_sys are changed

      The size of mtr_t::finish_writer<log_t::ARCHIVED_MMAP>(mtr_t*,size_t) is not affected by these changes. The size of some instructions (including some padding before a loop) does change, but the number of instructions does not.

      I am leaning towards the larger patch above.

      Attachments

        Issue Links

          Activity

            People

              thiru Thirunarayanan Balathandayuthapani
              marko Marko Mäkelä
              Marko Mäkelä Marko Mäkelä
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

                Created:
                Updated:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.