Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-25125

Draft: InnoDB: Assertion failure in trx0i_s.cc line 1167 upon SELECT * FROM information_schema.innodb_trx

Details

    Description

      This failure was observed first by Sujatha.  It caused a SEGV on 10.5.8-enterprise.
      After adding a SELECT * FROM information_schema.innodb_trx to some RQG grammar
      I was able to hit the problem on some 10.6 development too.
       
       2021-03-11 19:40:38 0 [Note] InnoDB: Completed resizing buffer pool at 210311 19:40:38.
             # I mention this because the resizing operation was error prone in the last 2 - 3 weeks.
      # 2021-03-11T19:43:27 [3351230] | 2021-03-11 19:40:38 0x7f1edda88300  InnoDB: Assertion failure in file /Server/bb-10.6-uring/storage/innobase/trx/trx0i_s.cc line 1167
      ??:0(__interceptor_tcgetattr)[0x7f1efae2fd30]
      # 2021-03-11T19:43:27 [3351230] | /Server_bin/bb-10.6-uring_asan/bin/mysqld(my_print_stacktrace+0xec)[0x5603cb31aa42]
      # 2021-03-11T19:43:27 [3351230] | /Server_bin/bb-10.6-uring_asan/bin/mysqld(handle_fatal_signal+0xa1a)[0x5603c9ed4aa9]
      # 2021-03-11T19:43:27 [3351230] | sigaction.c:0(__restore_rt)[0x7f1efac7d3c0]
      # 2021-03-11T19:43:27 [3351230] | ??:0(gsignal)[0x7f1efa76318b]
      # 2021-03-11T19:43:27 [3351230] | ??:0(abort)[0x7f1efa742859]
      # 2021-03-11T19:43:27 [3351230] | /Server_bin/bb-10.6-uring_asan/bin/mysqld(+0x30d5be3)[0x5603cae46be3]
      # 2021-03-11T19:43:27 [3351230] | /Server_bin/bb-10.6-uring_asan/bin/mysqld(+0x3047258)[0x5603cadb8258]
      # 2021-03-11T19:43:27 [3351230] | /Server_bin/bb-10.6-uring_asan/bin/mysqld(+0x3047517)[0x5603cadb8517]
      # 2021-03-11T19:43:27 [3351230] | ut/ut0dbg.cc:60(_sub_D_00099_0)[0x5603cadb9153]
      # 2021-03-11T19:43:27 [3351230] | trx/trx0i_s.cc:1167(fetch_data_into_cache_low(trx_i_s_cache_t*, trx_t const*))[0x5603cadb862d]
      # 2021-03-11T19:43:27 [3351230] | trx/trx0i_s.cc:1202(fetch_data_into_cache(trx_i_s_cache_t*)::{lambda(trx_t const&)#1}::operator()(trx_t const&) const)[0x5603cadb8773]
      # 2021-03-11T19:43:27 [3351230] | include/trx0sys.h:830(void thread_safe_trx_ilist_t::for_each<fetch_data_into_cache(trx_i_s_cache_t*)::{lambda(trx_t const&)#1}>(fetch_data_into_cache(trx_i_s_cache_t*)::{lambda(trx_t const&)#1}&&) const)[0x5603caa3c943]
      # 2021-03-11T19:43:27 [3351230] | trx/trx0i_s.cc:1196(fetch_data_into_cache(trx_i_s_cache_t*))[0x5603c993999d]
      # 2021-03-11T19:43:27 [3351230] | trx/trx0i_s.cc:1225(trx_i_s_possibly_fetch_data_into_cache(trx_i_s_cache_t*))[0x5603c9817c6e]
      # 2021-03-11T19:43:27 [3351230] | handler/i_s.cc:974(trx_i_s_common_fill_table(THD*, TABLE_LIST*, Item*))[0x5603c9815d42]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_show.cc:8725(get_schema_tables_result(JOIN*, enum_schema_table_state))[0x5603c9819820]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_select.cc:4433(JOIN::exec_inner())[0x5603c97ef8ec]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_parse.cc:6230(execute_sqlcom_select(THD*, TABLE_LIST*))[0x5603c976c5b9]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_parse.cc:3926(mysql_execute_command(THD*))[0x5603c975b9ac]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_parse.cc:7998(mysql_parse(THD*, char*, unsigned int, Parser_state*))[0x5603c97767fb]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_parse.cc:1888(dispatch_command(enum_server_command, THD*, char*, unsigned int, bool))[0x5603c974ef17]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_parse.cc:1397(do_command(THD*, bool))[0x5603c974c1e9]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_connect.cc:1410(do_handle_one_connection(CONNECT*, bool))[0x5603c9b3958c]
      # 2021-03-11T19:43:27 [3351230] | sql/sql_connect.cc:1314(handle_one_connection)[0x5603c9b38eee]
      # 2021-03-11T19:43:27 [3351230] | perfschema/pfs.cc:2203(pfs_spawn_thread)[0x5603ca7590cd]
      # 2021-03-11T19:43:27 [3351230] | nptl/pthread_create.c:478(start_thread)[0x7f1efac71609]
      # 2021-03-11T19:43:27 [3351230] | ??:0(clone)[0x7f1efa83f293]
      # 2021-03-11T19:43:27 [3351230] |
      # 2021-03-11T19:43:27 [3351230] | Trying to get some variables.
      # 2021-03-11T19:43:27 [3351230] | Some pointers may be invalid and cause the dump to abort.
      # 2021-03-11T19:43:27 [3351230] | Query (0x62b000069238): SELECT * FROM information_schema.innodb_trx /* E_R Thread1 QNO 5623 CON_ID 408 */
       
      I will add more information soon.
      

      Attachments

        Issue Links

          Activity

            There is a ut_error that is only present in debug builds:

            static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
            {
              i_s_locks_row_t *requested_lock_row;
             
              assert_trx_nonlocking_or_in_list(trx);
            

            In the 10.6 core dump that I analyzed, we have the following:

            #7  0x00005578905745cb in fetch_data_into_cache_low (cache=0x557892d5f320 <trx_i_s_cache_static>, 
                trx=0x7f8e1bbebd00) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1167
            #8  0x0000557890574b6b in <lambda(const trx_t&)>::operator() (trx=<optimized out>, 
                __closure=0x7f8e0bf39bc0) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1200
            #9  thread_safe_trx_ilist_t::for_each<fetch_data_into_cache(trx_i_s_cache_t*)::<lambda(const trx_t&)> >
                (callback=@0x7f8e0bf39bc0: {__cache = 0x557892d5f320 <trx_i_s_cache_static>}, this=<optimized out>)
                at /Server/10.6/storage/innobase/include/trx0sys.h:831
            #10 fetch_data_into_cache (cache=cache@entry=0x557892d5f320 <trx_i_s_cache_static>)
                at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1196
            #11 0x0000557890574e4f in trx_i_s_possibly_fetch_data_into_cache (
                cache=cache@entry=0x557892d5f320 <trx_i_s_cache_static>)
                at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1222
            

            and:

            (gdb) p trx->state
            $1 = {m = {static _S_min_alignment = 4, static _S_alignment = 4, _M_i = TRX_STATE_NOT_STARTED}}
            

            This should cause the macro check_trx_state() to abort in ut_error.

            This might be broken already in 10.3, by MDEV-14638 or related changes. A proper fix might be to acquire trx->mutex and then check trx->state. Looking at the 10.6 code, this code is protected by exclusive lock_sys.latch. So, if trx is holding any locks, it cannot release them (and thus cannot fully commit) while we traverse the collection. But, trx_sys.trx_list can also contain transactions that are not necessarily holding any locks. It looks like this traversal is blocking concurrent trx_t::free() thanks to trx_sys.trx_list.mutex:

              trx_sys.trx_list.for_each([cache](const trx_t &trx) {
                if (!cache->is_truncated && trx_is_started(&trx) &&
                    &trx != purge_sys.query->trx)
                {
                  fetch_data_into_cache_low(cache, &trx);
                }
              });
            

            The only thing that seems to be missing is the trx->mutex acquisition before rechecking the trx->state. Currently, if the transaction is not holding any locks, it can transition from TRX_STATE_COMMITTED_IN_MEMORY to TRX_STATE_NOT_STARTED during our execution.

            marko Marko Mäkelä added a comment - There is a ut_error that is only present in debug builds: static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx) { i_s_locks_row_t *requested_lock_row;   assert_trx_nonlocking_or_in_list(trx); In the 10.6 core dump that I analyzed, we have the following: #7 0x00005578905745cb in fetch_data_into_cache_low (cache=0x557892d5f320 <trx_i_s_cache_static>, trx=0x7f8e1bbebd00) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1167 #8 0x0000557890574b6b in <lambda(const trx_t&)>::operator() (trx=<optimized out>, __closure=0x7f8e0bf39bc0) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1200 #9 thread_safe_trx_ilist_t::for_each<fetch_data_into_cache(trx_i_s_cache_t*)::<lambda(const trx_t&)> > (callback=@0x7f8e0bf39bc0: {__cache = 0x557892d5f320 <trx_i_s_cache_static>}, this=<optimized out>) at /Server/10.6/storage/innobase/include/trx0sys.h:831 #10 fetch_data_into_cache (cache=cache@entry=0x557892d5f320 <trx_i_s_cache_static>) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1196 #11 0x0000557890574e4f in trx_i_s_possibly_fetch_data_into_cache ( cache=cache@entry=0x557892d5f320 <trx_i_s_cache_static>) at /Server/10.6/storage/innobase/trx/trx0i_s.cc:1222 and: (gdb) p trx->state $1 = {m = {static _S_min_alignment = 4, static _S_alignment = 4, _M_i = TRX_STATE_NOT_STARTED}} This should cause the macro check_trx_state() to abort in ut_error . This might be broken already in 10.3, by MDEV-14638 or related changes. A proper fix might be to acquire trx->mutex and then check trx->state . Looking at the 10.6 code, this code is protected by exclusive lock_sys.latch . So, if trx is holding any locks, it cannot release them (and thus cannot fully commit) while we traverse the collection. But, trx_sys.trx_list can also contain transactions that are not necessarily holding any locks. It looks like this traversal is blocking concurrent trx_t::free() thanks to trx_sys.trx_list.mutex : trx_sys.trx_list.for_each([cache]( const trx_t &trx) { if (!cache->is_truncated && trx_is_started(&trx) && &trx != purge_sys.query->trx) { fetch_data_into_cache_low(cache, &trx); } }); The only thing that seems to be missing is the trx->mutex acquisition before rechecking the trx->state . Currently, if the transaction is not holding any locks, it can transition from TRX_STATE_COMMITTED_IN_MEMORY to TRX_STATE_NOT_STARTED during our execution.

            In 10.3 and 10.4, fetch_data_into_cache() is protected by both lock_sys.mutex and trx_sys.mutex, and trx_t::free() is also protected by trx_sys.mutex. It looks like this race condition was introduced by MDEV-14638 in MariaDB 10.3.4 already.

            In MDEV-22701 in MariaDB Server 10.5.5, the iteration was refactored, and concurrent trx_t::free() is blocked by trx_sys.trx_list.mutex instead of trx_sys.mutex.

            I think that the following change (applicable to 10.6) should fix it:

            diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
            index d3741940d76..392d44a3cdd 100644
            --- a/storage/innobase/include/trx0sys.h
            +++ b/storage/innobase/include/trx0sys.h
            @@ -832,6 +832,14 @@ class thread_safe_trx_ilist_t
                 mysql_mutex_unlock(&mutex);
               }
             
            +  template <typename Callable> void for_each(Callable &&callback)
            +  {
            +    mysql_mutex_lock(&mutex);
            +    for (auto &trx : trx_list)
            +      callback(trx);
            +    mysql_mutex_unlock(&mutex);
            +  }
            +
               void freeze() const { mysql_mutex_lock(&mutex); }
               void unfreeze() const { mysql_mutex_unlock(&mutex); }
             
            diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
            index 17b7f6c4701..b79c1169d33 100644
            --- a/storage/innobase/trx/trx0i_s.cc
            +++ b/storage/innobase/trx/trx0i_s.cc
            @@ -1193,11 +1193,14 @@ static void fetch_data_into_cache(trx_i_s_cache_t *cache)
               trx_i_s_cache_clear(cache);
             
               /* Capture the state of transactions */
            -  trx_sys.trx_list.for_each([cache](const trx_t &trx) {
            -    if (!cache->is_truncated && trx_is_started(&trx) &&
            +  trx_sys.trx_list.for_each([cache](trx_t &trx) {
            +    if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
                     &trx != purge_sys.query->trx)
                 {
            -      fetch_data_into_cache_low(cache, &trx);
            +      trx.mutex_lock();
            +      if (trx.state != TRX_STATE_NOT_STARTED)
            +        fetch_data_into_cache_low(cache, &trx);
            +      trx.mutex_unlock();
                 }
               });
               cache->is_truncated= false;
            

            marko Marko Mäkelä added a comment - In 10.3 and 10.4, fetch_data_into_cache() is protected by both lock_sys.mutex and trx_sys.mutex , and trx_t::free() is also protected by trx_sys.mutex . It looks like this race condition was introduced by MDEV-14638 in MariaDB 10.3.4 already. In MDEV-22701 in MariaDB Server 10.5.5, the iteration was refactored, and concurrent trx_t::free() is blocked by trx_sys.trx_list.mutex instead of trx_sys.mutex . I think that the following change (applicable to 10.6) should fix it: diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index d3741940d76..392d44a3cdd 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -832,6 +832,14 @@ class thread_safe_trx_ilist_t mysql_mutex_unlock(&mutex); } + template <typename Callable> void for_each(Callable &&callback) + { + mysql_mutex_lock(&mutex); + for (auto &trx : trx_list) + callback(trx); + mysql_mutex_unlock(&mutex); + } + void freeze() const { mysql_mutex_lock(&mutex); } void unfreeze() const { mysql_mutex_unlock(&mutex); } diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc index 17b7f6c4701..b79c1169d33 100644 --- a/storage/innobase/trx/trx0i_s.cc +++ b/storage/innobase/trx/trx0i_s.cc @@ -1193,11 +1193,14 @@ static void fetch_data_into_cache(trx_i_s_cache_t *cache) trx_i_s_cache_clear(cache); /* Capture the state of transactions */ - trx_sys.trx_list.for_each([cache](const trx_t &trx) { - if (!cache->is_truncated && trx_is_started(&trx) && + trx_sys.trx_list.for_each([cache](trx_t &trx) { + if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED && &trx != purge_sys.query->trx) { - fetch_data_into_cache_low(cache, &trx); + trx.mutex_lock(); + if (trx.state != TRX_STATE_NOT_STARTED) + fetch_data_into_cache_low(cache, &trx); + trx.mutex_unlock(); } }); cache->is_truncated= false;

            People

              marko Marko Mäkelä
              mleich Matthias Leich
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.