Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-23839

innodb_fast_shutdown=0 hang on change buffer merge

    XMLWordPrintable

    Details

      Description

      I encountered the following crash when running tests locally:

      10.2 842616532a76cd078ba1d526ae4b85d3ad4be069

      innodb.xa_recovery 'innodb'              w34 [ fail ]  Found warnings/errors in server log file!
              Test ended at 2020-09-29 10:10:40
      line
      mysqld: /mariadb/10.2o/storage/innobase/include/sync0rw.ic:284: void rw_lock_s_lock_func(rw_lock_t*, ulint, const char*, unsigned int): Assertion `!rw_lock_own_flagged(lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)' failed.
      #7  0x00007fe19f0835b2 in __GI___assert_fail (assertion=assertion@entry=0x561773fc7158 "!rw_lock_own_flagged(lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)", file=file@entry=0x561773fb3980 "/mariadb/10.2o/storage/innobase/include/sync0rw.ic", line=line@entry=284, function=function@entry=0x561773fc7110 "void rw_lock_s_lock_func(rw_lock_t*, ulint, const char*, unsigned int)") at assert.c:101
      #8  0x0000561773bc1d2b in rw_lock_s_lock_func (line=1183, file_name=0x561774022b88 "/mariadb/10.2o/storage/innobase/btr/btr0cur.cc", pass=0, lock=0x561775d11918) at /mariadb/10.2o/storage/innobase/include/sync0rw.ic:286
      #9  pfs_rw_lock_s_lock_func (lock=lock@entry=0x561775d11918, pass=pass@entry=0, file_name=file_name@entry=0x561774022b88 "/mariadb/10.2o/storage/innobase/btr/btr0cur.cc", line=line@entry=1183) at /mariadb/10.2o/storage/innobase/include/sync0rw.ic:631
      #10 0x0000561773bd9aab in mtr_t::s_lock (line=1183, file=0x561774022b88 "/mariadb/10.2o/storage/innobase/btr/btr0cur.cc", lock=0x561775d11918, this=0x7fe1837fa8e0) at /mariadb/10.2o/storage/innobase/include/mtr0mtr.ic:237
      #11 btr_cur_search_to_nth_level_func (index=index@entry=0x561775d117b0, level=level@entry=0, tuple=tuple@entry=0x7fe1880012d0, mode=mode@entry=PAGE_CUR_GE, latch_mode=<optimized out>, latch_mode@entry=2, cursor=cursor@entry=0x7fe1837fa7e8, has_search_latch=0, file=0x561773fc8df8 "/mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc", line=4332, mtr=0x7fe1837fa8e0, autoinc=0) at /mariadb/10.2o/storage/innobase/btr/btr0cur.cc:1183
      #12 0x00005617739ea387 in btr_pcur_open_low (index=0x561775d117b0, level=<optimized out>, level@entry=0, tuple=tuple@entry=0x7fe1880012d0, mode=mode@entry=PAGE_CUR_GE, latch_mode=latch_mode@entry=2, cursor=cursor@entry=0x7fe1837fa7e8, file=<optimized out>, line=<optimized out>, autoinc=0, mtr=0x7fe1837fa8e0) at /mariadb/10.2o/storage/innobase/include/btr0pcur.ic:457
      #13 0x00005617739ee5fa in ibuf_delete_recs (page_id=<optimized out>) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:4332
      #14 0x00005617739eec38 in ibuf_merge_or_delete_for_page (block=block@entry=0x7fe1995cee70, page_id={m_space = 0, m_page_no = 835}, page_size=page_size@entry=0x7fe1995cee80, update_ibuf_bitmap=update_ibuf_bitmap@entry=1) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:4480
      #15 0x0000561773c10b3c in buf_page_io_complete (bpage=bpage@entry=0x7fe1995cee70, dblwr=dblwr@entry=false, evict=evict@entry=false) at /mariadb/10.2o/storage/innobase/buf/buf0buf.cc:6206
      #16 0x0000561773c3845a in buf_read_page_low (err=err@entry=0x7fe1837fbaa4, sync=sync@entry=true, type=type@entry=0, mode=mode@entry=132, page_id={m_space = 0, m_page_no = 835}, page_size=@0x7fe1837fc130: {m_physical = 16384, m_logical = 16384, m_is_compressed = 0}, unzip=false, ignore_missing_space=false) at /mariadb/10.2o/storage/innobase/buf/buf0rea.cc:215
      #17 0x0000561773c38d9e in buf_read_page (page_id={m_space = 0, m_page_no = 835}, page_size=@0x7fe1837fc130: {m_physical = 16384, m_logical = 16384, m_is_compressed = 0}) at /mariadb/10.2o/storage/innobase/buf/buf0rea.cc:412
      #18 0x0000561773c0db4e in buf_page_get_low (page_id=page_id@entry={m_space = 0, m_page_no = 835}, page_size=@0x7fe1837fc130: {m_physical = 16384, m_logical = 16384, m_is_compressed = 0}, rw_latch=rw_latch@entry=8, guess=<optimized out>, guess@entry=0x0, mode=mode@entry=10, file=file@entry=0x561773fc8df8 "/mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc", line=4786, mtr=0x7fe1837fd050, err=0x7fe1837fc104) at /mariadb/10.2o/storage/innobase/buf/buf0buf.cc:4387
      #19 0x0000561773c0f22e in buf_page_get_gen (page_id={m_space = 0, m_page_no = 835}, page_size=@0x7fe1837fc130: {m_physical = 16384, m_logical = 16384, m_is_compressed = 0}, rw_latch=rw_latch@entry=8, guess=guess@entry=0x0, mode=mode@entry=10, file=file@entry=0x561773fc8df8 "/mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc", line=4786, mtr=0x7fe1837fd050, err=0x7fe1837fc104) at /mariadb/10.2o/storage/innobase/buf/buf0buf.cc:4907
      #20 0x0000561773bda515 in btr_cur_search_to_nth_level_func (index=index@entry=0x561775d117b0, level=level@entry=0, tuple=tuple@entry=0x7fe188001040, mode=mode@entry=PAGE_CUR_GE, latch_mode=<optimized out>, latch_mode@entry=2, cursor=cursor@entry=0x7fe1837fcf50, has_search_latch=<optimized out>, file=<optimized out>, line=<optimized out>, mtr=<optimized out>, autoinc=<optimized out>) at /mariadb/10.2o/storage/innobase/btr/btr0cur.cc:1281
      #21 0x0000561773be5fcc in btr_pcur_open_low (index=0x561775d117b0, level=<optimized out>, level@entry=0, tuple=tuple@entry=0x7fe188001040, mode=mode@entry=PAGE_CUR_GE, latch_mode=latch_mode@entry=2, cursor=cursor@entry=0x7fe1837fcf50, file=<optimized out>, line=<optimized out>, autoinc=0, mtr=0x7fe1837fd050) at /mariadb/10.2o/storage/innobase/include/btr0pcur.ic:457
      #22 0x0000561773be6192 in btr_pcur_open_on_user_rec_func (index=<optimized out>, tuple=tuple@entry=0x7fe188001040, mode=mode@entry=PAGE_CUR_GE, latch_mode=latch_mode@entry=2, cursor=cursor@entry=0x7fe1837fcf50, file=file@entry=0x561773fc8df8 "/mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc", line=4786, mtr=0x7fe1837fd050) at /mariadb/10.2o/storage/innobase/btr/btr0pcur.cc:604
      #23 0x00005617739ef9a4 in ibuf_delete_for_discarded_space (space=37) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:4786
      #24 0x0000561773c3a133 in buf_read_ibuf_merge_pages (sync=sync@entry=false, space_ids=space_ids@entry=0x7fe1837fd7a0, page_nos=page_nos@entry=0x7fe1837fd760, n_stored=1) at /mariadb/10.2o/storage/innobase/buf/buf0rea.cc:846
      #25 0x00005617739ed1ac in ibuf_merge_pages (n_pages=0x7fe1837fddf8, sync=sync@entry=false) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:2519
      #26 0x00005617739ed1f5 in ibuf_merge (n_pages=n_pages@entry=0x7fe1837fddf8, sync=sync@entry=false) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:2624
      #27 0x00005617739f10e3 in ibuf_merge_in_background (full=full@entry=false) at /mariadb/10.2o/storage/innobase/ibuf/ibuf0ibuf.cc:2695
      #28 0x0000561773b395c0 in srv_master_do_active_tasks () at /mariadb/10.2o/storage/innobase/srv/srv0srv.cc:2234
      #29 0x0000561773b3c036 in srv_master_thread (arg=<optimized out>) at /mariadb/10.2o/storage/innobase/srv/srv0srv.cc:2468
      #30 0x00007fe19f549ea7 in start_thread (arg=<optimized out>) at pthread_create.c:477
      #31 0x00007fe19f14ceaf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
      

      The problem is that ibuf_merge_or_delete_for_page() is invoking ibuf_delete_recs() on the change buffer tree itself. By design, the changes to the change buffer are never buffered by the change buffer.

      The assertion fails, because ibuf->index->lock was already S-latched at the time when we are trying to acquire an S-latch. In a non-debug build, such an execution should cause a server hang on shutdown.

      I was able to repeat the crash by running the following (on 1 of 2 tries):

      ./mtr --no-reorder innodb.innodb-page_compression_default innodb.innodb-page_compression_snappy innodb.innodb-page_compression_zip innodb.innodb_wl6326_big innodb.xa_recovery
      

      The following change fixes the bug:

      diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
      index 7e01c864a50..7068dab77a4 100644
      --- a/storage/innobase/ibuf/ibuf0ibuf.cc
      +++ b/storage/innobase/ibuf/ibuf0ibuf.cc
      @@ -4472,7 +4472,10 @@ ibuf_merge_or_delete_for_page(
       
       				fil_space_release(space);
       				if (UNIV_UNLIKELY(srv_shutdown_state)
      -				    && !srv_fast_shutdown) {
      +				    && !srv_fast_shutdown
      +				    && (!block
      +					|| btr_page_get_index_id(block->frame)
      +					!= DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
       					/* Prevent an infinite loop on slow
       					shutdown, in case the bitmap bits are
       					wrongly clear even though buffered
      

      I successfully ran the above sequence of tests 5 times with the above fix.

        Attachments

          Issue Links

            Activity

              People

              Assignee:
              marko Marko Mäkelä
              Reporter:
              marko Marko Mäkelä
              Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

                Dates

                Created:
                Updated:
                Resolved:

                  Git Integration