Details

    Description

      Note: the part of the summary about `srv_purge_coordinator_thread()` signalling `purge_sys->event` comes from Marko.

      Crashing test case

      --source include/have_innodb.inc
       
      CREATE TABLE t (pk INT AUTO_INCREMENT PRIMARY KEY, i INT, j INT, KEY(i), KEY(j)) ENGINE=InnoDB;
       
      --connect (con1,localhost,root,,)
      --send
        FLUSH TABLE t FOR EXPORT;
        
      --connection default
      --source include/restart_mysqld.inc
       
      # Cleanup
      DROP TABLE t;
      

      10.2 387bdf07ae

      #5  0x00007fac6e1233fa in abort () from /lib/x86_64-linux-gnu/libc.so.6
      #6  0x000055fbc1aca859 in ut_dbg_assertion_failed (expr=0x0, file=0x55fbc1f90cb8 "/data/src/10.2/storage/innobase/trx/trx0purge.cc", line=1835) at /data/src/10.2/storage/innobase/ut/ut0dbg.cc:61
      #7  0x000055fbc1a95f31 in trx_purge_run () at /data/src/10.2/storage/innobase/trx/trx0purge.cc:1835
      #8  0x000055fbc1a578ee in row_quiesce_table_complete (table=0x7fac18089808, trx=0x7fac6808bca8) at /data/src/10.2/storage/innobase/row/row0quiesce.cc:606
      #9  0x000055fbc18cbd9c in ha_innobase::external_lock (this=0x7fac0c018d38, thd=0x7fac0c000b00, lock_type=2) at /data/src/10.2/storage/innobase/handler/ha_innodb.cc:16273
      #10 0x000055fbc15b4236 in handler::ha_external_lock (this=0x7fac0c018d38, thd=0x7fac0c000b00, lock_type=2) at /data/src/10.2/sql/handler.cc:5897
      #11 0x000055fbc16be0fc in unlock_external (thd=0x7fac0c000b00, table=0x7fac0c01f158, count=1) at /data/src/10.2/sql/lock.cc:719
      #12 0x000055fbc16bd5d4 in mysql_unlock_tables (thd=0x7fac0c000b00, sql_lock=0x7fac0c01f140, free_lock=false) at /data/src/10.2/sql/lock.cc:429
      #13 0x000055fbc16bd285 in mysql_lock_tables (thd=0x7fac0c000b00, sql_lock=0x7fac0c01f140, flags=4096) at /data/src/10.2/sql/lock.cc:366
      #14 0x000055fbc16bcfe0 in mysql_lock_tables (thd=0x7fac0c000b00, tables=0x7fac0c011770, count=1, flags=4096) at /data/src/10.2/sql/lock.cc:303
      #15 0x000055fbc12cb15e in lock_tables (thd=0x7fac0c000b00, tables=0x7fac0c011158, count=1, flags=4096) at /data/src/10.2/sql/sql_base.cc:4902
      #16 0x000055fbc12ca97a in open_and_lock_tables (thd=0x7fac0c000b00, options=..., tables=0x7fac0c011158, derived=false, flags=4096, prelocking_strategy=0x7fac604f65a0) at /data/src/10.2/sql/sql_base.cc:4691
      #17 0x000055fbc14b3db4 in open_and_lock_tables (thd=0x7fac0c000b00, tables=0x7fac0c011158, derived=false, flags=4096, prelocking_strategy=0x7fac604f65a0) at /data/src/10.2/sql/sql_base.h:249
      #18 0x000055fbc14b4ad4 in flush_tables_with_read_lock (thd=0x7fac0c000b00, all_tables=0x7fac0c011158) at /data/src/10.2/sql/sql_reload.cc:559
      #19 0x000055fbc133d859 in mysql_execute_command (thd=0x7fac0c000b00) at /data/src/10.2/sql/sql_parse.cc:5330
      #20 0x000055fbc134511c in mysql_parse (thd=0x7fac0c000b00, rawbuf=0x7fac0c011078 "FLUSH TABLE t FOR EXPORT", length=24, parser_state=0x7fac604f7200, is_com_multi=false, is_next_command=false) at /data/src/10.2/sql/sql_parse.cc:7875
      #21 0x000055fbc1332dc0 in dispatch_command (command=COM_QUERY, thd=0x7fac0c000b00, packet=0x7fac0c008891 "FLUSH TABLE t FOR EXPORT", packet_length=24, is_com_multi=false, is_next_command=false) at /data/src/10.2/sql/sql_parse.cc:1812
      #22 0x000055fbc1331736 in do_command (thd=0x7fac0c000b00) at /data/src/10.2/sql/sql_parse.cc:1360
      #23 0x000055fbc147e917 in do_handle_one_connection (connect=0x55fbc462d800) at /data/src/10.2/sql/sql_connect.cc:1354
      #24 0x000055fbc147e6a4 in handle_one_connection (arg=0x55fbc462d800) at /data/src/10.2/sql/sql_connect.cc:1260
      #25 0x000055fbc189b060 in pfs_spawn_thread (arg=0x55fbc464c400) at /data/src/10.2/storage/perfschema/pfs.cc:1862
      #26 0x00007fac7005f494 in start_thread (arg=0x7fac604f8700) at pthread_create.c:333
      #27 0x00007fac6e1d793f in clone () from /lib/x86_64-linux-gnu/libc.so.6
      

      Also crashes on a non-debug build.

      Hanging test case

      --source include/have_innodb.inc
       
      CREATE TABLE t (pk INT AUTO_INCREMENT PRIMARY KEY, i INT, j INT, KEY(i), KEY(j)) ENGINE=InnoDB;
       
      BEGIN;
      INSERT INTO t VALUES (NULL,1,10),(NULL,2,20),(NULL,3,30),(NULL,4,40);
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      INSERT INTO t SELECT NULL, i, j FROM t;
      COMMIT;
      UPDATE t SET i = i+10, j = j*100;
       
      --connect (con1,localhost,root,,)
      --send
        FLUSH TABLE t FOR EXPORT;
        
      --connection default
      --source include/restart_mysqld.inc
       
      # Cleanup
      DROP TABLE t;
      

      10.2 387bdf07ae

      Thread 3 (Thread 0x7f0d540cb700 (LWP 21779)):
      #0  pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
      #1  0x0000562cd0908864 in os_event::wait (this=0x562cd3b0cf80) at /data/src/10.2/storage/innobase/os/os0event.cc:166
      #2  0x0000562cd0908273 in os_event::wait_low (this=0x562cd3b0cf80, reset_sig_count=1) at /data/src/10.2/storage/innobase/os/os0event.cc:336
      #3  0x0000562cd0908619 in os_event_wait_low (event=0x562cd3b0cf80, reset_sig_count=1) at /data/src/10.2/storage/innobase/os/os0event.cc:535
      #4  0x0000562cd0a25ce6 in trx_purge_stop () at /data/src/10.2/storage/innobase/trx/trx0purge.cc:1798
      #5  0x0000562cd09e72f7 in row_quiesce_table_start (table=0x7f0d00089708, trx=0x7f0d57217ca8) at /data/src/10.2/storage/innobase/row/row0quiesce.cc:525
      #6  0x0000562cd085bd2d in ha_innobase::external_lock (this=0x7f0d0000a138, thd=0x7f0d04000b00, lock_type=0) at /data/src/10.2/storage/innobase/handler/ha_innodb.cc:16257
      #7  0x0000562cd0544236 in handler::ha_external_lock (this=0x7f0d0000a138, thd=0x7f0d04000b00, lock_type=0) at /data/src/10.2/sql/handler.cc:5897
      #8  0x0000562cd064d41b in lock_external (thd=0x7f0d04000b00, tables=0x7f0d040067e8, count=1) at /data/src/10.2/sql/lock.cc:395
      #9  0x0000562cd064d124 in mysql_lock_tables (thd=0x7f0d04000b00, sql_lock=0x7f0d040067d0, flags=4096) at /data/src/10.2/sql/lock.cc:341
      #10 0x0000562cd064cfe0 in mysql_lock_tables (thd=0x7f0d04000b00, tables=0x7f0d04011770, count=1, flags=4096) at /data/src/10.2/sql/lock.cc:303
      #11 0x0000562cd025b15e in lock_tables (thd=0x7f0d04000b00, tables=0x7f0d04011158, count=1, flags=4096) at /data/src/10.2/sql/sql_base.cc:4902
      #12 0x0000562cd025a97a in open_and_lock_tables (thd=0x7f0d04000b00, options=..., tables=0x7f0d04011158, derived=false, flags=4096, prelocking_strategy=0x7f0d540c95a0) at /data/src/10.2/sql/sql_base.cc:4691
      #13 0x0000562cd0443db4 in open_and_lock_tables (thd=0x7f0d04000b00, tables=0x7f0d04011158, derived=false, flags=4096, prelocking_strategy=0x7f0d540c95a0) at /data/src/10.2/sql/sql_base.h:249
      #14 0x0000562cd0444ad4 in flush_tables_with_read_lock (thd=0x7f0d04000b00, all_tables=0x7f0d04011158) at /data/src/10.2/sql/sql_reload.cc:559
      #15 0x0000562cd02cd859 in mysql_execute_command (thd=0x7f0d04000b00) at /data/src/10.2/sql/sql_parse.cc:5330
      #16 0x0000562cd02d511c in mysql_parse (thd=0x7f0d04000b00, rawbuf=0x7f0d04011078 "FLUSH TABLE t FOR EXPORT", length=24, parser_state=0x7f0d540ca200, is_com_multi=false, is_next_command=false) at /data/src/10.2/sql/sql_parse.cc:7875
      #17 0x0000562cd02c2dc0 in dispatch_command (command=COM_QUERY, thd=0x7f0d04000b00, packet=0x7f0d04008891 "", packet_length=24, is_com_multi=false, is_next_command=false) at /data/src/10.2/sql/sql_parse.cc:1812
      #18 0x0000562cd02c1736 in do_command (thd=0x7f0d04000b00) at /data/src/10.2/sql/sql_parse.cc:1360
      #19 0x0000562cd040e917 in do_handle_one_connection (connect=0x562cd3953580) at /data/src/10.2/sql/sql_connect.cc:1354
      #20 0x0000562cd040e6a4 in handle_one_connection (arg=0x562cd3953580) at /data/src/10.2/sql/sql_connect.cc:1260
      #21 0x0000562cd082b060 in pfs_spawn_thread (arg=0x562cd3972180) at /data/src/10.2/storage/perfschema/pfs.cc:1862
      #22 0x00007f0d6322f494 in start_thread (arg=0x7f0d540cb700) at pthread_create.c:333
      #23 0x00007f0d613a793f in clone () from /lib/x86_64-linux-gnu/libc.so.6
       
      Thread 2 (Thread 0x7f0d54081700 (LWP 21781)):
      #0  pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
      #1  0x0000562cd0ca0158 in safe_cond_wait (cond=0x562cd180e7c0 <COND_thread_count>, mp=0x562cd180d4c0 <LOCK_thread_count>, file=0x562cd0d04640 "/data/src/10.2/include/mysql/psi/mysql_thread.h", line=1138) at /data/src/10.2/mysys/thr_mutex.c:491
      #2  0x0000562cd01e2f37 in inline_mysql_cond_wait (that=0x562cd180e7c0 <COND_thread_count>, mutex=0x562cd180d4c0 <LOCK_thread_count>, src_file=0x562cd0d0639a "/data/src/10.2/sql/mysqld.cc", src_line=1823) at /data/src/10.2/include/mysql/psi/mysql_thread.h:1138
      #3  0x0000562cd01e58e6 in close_connections () at /data/src/10.2/sql/mysqld.cc:1823
      #4  0x0000562cd01e5e40 in kill_server (sig_ptr=0x0) at /data/src/10.2/sql/mysqld.cc:2006
      #5  0x0000562cd01e5ebb in kill_server_thread (arg=0x7f0d541a8e00) at /data/src/10.2/sql/mysqld.cc:2038
      #6  0x0000562cd082b060 in pfs_spawn_thread (arg=0x7f0d080013f0) at /data/src/10.2/storage/perfschema/pfs.cc:1862
      #7  0x00007f0d6322f494 in start_thread (arg=0x7f0d54081700) at pthread_create.c:333
      #8  0x00007f0d613a793f in clone () from /lib/x86_64-linux-gnu/libc.so.6
      

      Couldn't reproduce either the crash or the hang on 10.0/10.1; both produce:

      2017-10-03 19:12:32 140068205673216 [Warning] InnoDB: Quiesce aborted!
      

      Attachments

        Issue Links

          Activity

            It looks like we should release the purge_sys->latch before invoking srv_purge_wakeup():

            diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
            index 16ee1d2e8e8..603b967484e 100644
            --- a/storage/innobase/trx/trx0purge.cc
            +++ b/storage/innobase/trx/trx0purge.cc
            @@ -1781,8 +1781,8 @@ trx_purge_stop(void)
             
             		const int64_t sig_count = os_event_reset(purge_sys->event);
             		purge_sys->state = PURGE_STATE_STOP;
            -		srv_purge_wakeup();
             		rw_lock_x_unlock(&purge_sys->latch);
            +		srv_purge_wakeup();
             		/* Wait for purge coordinator to signal that it
             		is suspended. */
             		os_event_wait_low(purge_sys->event, sig_count);
            

             Before this 10.2 fix attempt for MDEV-11802, there were two code paths calling srv_purge_wakeup() in trx_purge_stop(): one of them would hold purge_sys->latch; the other would not. After the fix, both code paths would hold purge_sys->latch. I do not see a correctness reason why we should hold the latch there.

             marko Marko Mäkelä added a comment
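
             A minimal sketch of the notify-after-unlock ordering referred to above, written with standard C++ primitives and purely illustrative names (it is not the actual InnoDB code): the state change is made under the latch, the latch is released, and only then is the waiter woken, so the woken thread does not immediately block on a latch that the signalling thread still holds.

             // Illustrative only: std::mutex/std::condition_variable stand in for
             // purge_sys->latch and purge_sys->event; all names are hypothetical.
             #include <condition_variable>
             #include <mutex>
             #include <thread>

             std::mutex latch;                  // plays the role of purge_sys->latch
             std::condition_variable event;     // plays the role of purge_sys->event
             bool stop_requested = false;       // plays the role of purge_sys->state

             void request_stop()                // roughly the trx_purge_stop() side
             {
                 {
                     std::lock_guard<std::mutex> guard(latch);
                     stop_requested = true;     // state change protected by the latch
                 }                              // latch released here ...
                 event.notify_one();            // ... and only then is the waiter woken
             }

             void coordinator_wait()            // roughly the purge coordinator side
             {
                 std::unique_lock<std::mutex> lock(latch);
                 event.wait(lock, [] { return stop_requested; });
                 // proceed only after the state change has been observed
             }

             int main()
             {
                 std::thread coordinator(coordinator_wait);
                 request_stop();
                 coordinator.join();
                 return 0;
             }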

            Coredump from 10.3 4a5e23e257e229b548599133dbed5162af9df6d9 is on perro together with the binary, logs and data directory.

             elenst Elena Stepanova added a comment

            I was unable to repeat the hang in 10.3. The suggestion in this bug title was implemented in MDEV-14422 (10.2.12, 10.3.3).

             The hang in threads_10.3 is in dict_stats_thread, which is waiting for rw_lock_x_lock(dict_operation_lock). This prevents dict_stats_shutdown from terminating. The reason is that srv_worker_thread (Thread 10) is waiting for purge_sys.latch while holding dict_operation_lock in shared mode.

             In threads_10.2 it is possible that the loop in srv_purge_wakeup() is not terminating, because not only is one srv_purge_worker (Thread 25) waiting for dict_operation_lock, but the srv_purge_coordinator_thread (Thread 22) is also waiting for purge_sys->latch while holding dict_operation_lock in shared mode. This thread is blocking many DDL operations, which are waiting for an exclusive lock on dict_operation_lock and are thereby blocking further S-lock requests.

             Either way, we must find out whether, and why, trx_undo_get_undo_rec() is waiting for purge_sys.latch forever. The purge threads should only hold dict_operation_lock for the duration of a single undo log record operation, so it seems that trx_undo_get_undo_rec() is stuck. The only way purge_sys.latch could be continuously held in exclusive mode is if srv_release_threads() remains stuck in a loop.

            elenst, can I please have a core dump of either hang?

             marko Marko Mäkelä added a comment
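
             To make the suspected shape of that hang concrete, here is a minimal, heavily simplified sketch with purely illustrative names (not the actual InnoDB code). A "wake until acknowledged" loop cannot terminate while the thread it is trying to wake is blocked on an unrelated latch; the demo bounds the loop so that it exits, whereas in the scenario described above it would spin forever.

             #include <atomic>
             #include <chrono>
             #include <condition_variable>
             #include <cstdio>
             #include <mutex>
             #include <thread>

             std::mutex              other_latch;          // stands in for dict_operation_lock
             std::condition_variable purge_event;          // stands in for purge_sys->event
             std::atomic<bool>       coordinator_suspended{false};

             void coordinator()
             {
                 // The coordinator first needs the other latch, which the main thread
                 // never releases during the loop below, so the "suspended" step is
                 // never reached and the acknowledgment never arrives.
                 std::lock_guard<std::mutex> blocked(other_latch);
                 coordinator_suspended = true;
                 purge_event.notify_all();
             }

             int main()
             {
                 std::unique_lock<std::mutex> holder(other_latch);  // simulates the S-lock holder
                 std::thread coord(coordinator);

                 // The "wake until acknowledged" loop: bounded here, unbounded in the
                 // scenario described in the comment above.
                 for (int i = 0; i < 100 && !coordinator_suspended; i++) {
                     purge_event.notify_all();
                     std::this_thread::sleep_for(std::chrono::milliseconds(1));
                 }
                 std::printf("coordinator suspended: %s\n",
                             coordinator_suspended ? "yes" : "no (would hang forever)");

                 holder.unlock();       // release the latch so the demo can finish
                 coord.join();
                 return 0;
             }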

             It seems the test cases no longer work well on 10.2/10.3 (at least they don't for me at the moment). Here is an alternative way to reproduce the hang:

            git clone https://github.com/elenst/rqg --branch mdev13987 rqg-mdev13987
            cd rqg-mdev13987
            # Use your path to the basedir!
            perl ./run-scenario.pl --duration=350 --threads=8 --grammar=mdev13987.yy --seed=1525736601 --scenario=Restart --mysqld=--lock-wait-timeout=10 --mysqld=--innodb-lock-wait-timeout=5 --vardir=/dev/shm/vardir --basedir=/data/bld/10.3
            

             As of 10.3 15419a558370 and 10.2 6c0f3dd3414a, it fails quite reliably for me on 10.3 (nearly every time) and much less often on 10.2, but it still fails. Stack traces from 10.2 and 10.3 are attached:
            threads_10.2
            threads_10.3

             elenst Elena Stepanova added a comment

            People

              marko Marko Mäkelä
              elenst Elena Stepanova