Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-30863

Server freeze, all threads in trx_assign_rseg_low

Details

    Description

      origin/bb-10.6-MDEV-26055-MDEV-26827 78dd355afa70f1c1eb36d8ddd235d797a2d4cc96 2023-03-15T19:11:59+02:00 with some additional patch.
      Per Marko: The problem affects trees without his modifications too.
       
      Several sessions run a DDL/DML Mix.
      At some point of time the processlist looks like
       
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': Content of processlist ---------- end
      # 2023-03-15T10:20:52 [2723568] ERROR: Reporter 'Deadlock': 9 stalled queries detected, declaring deadlock at DSN dbi:mysql:host=127.0.0.1:port=25880:user=root:database=test:mysql_local_infile=1. Will exit with STATUS_SERVER_DEADLOCKED later.
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': Content of processlist ---------- begin
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': ID -- COMMAND -- TIME -- INFO -- state
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 17 -- Sleep -- 319 -- <undef> -- ok
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 18 -- Killed -- 205 -- INSERT INTO `table100_innodb_key_pk_parts_2_int_autoinc` ( `col_char_255_ucs2`, `col_set_ucs2`, `col_longtext_utf8` ) VALUES ( REPEAT( _UTF8 0x1E, 242 ), CONVERT( 'get' USING UTF8 ), CONVERT( 'back' USING UCS2 ) ) /* E_R Thread1 QNO 955 CON_ID 18 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 19 -- Killed -- 205 -- UPDATE `table100_innodb_int_autoinc` SET `col_longtext_ucs2` = REPEAT( _ASCII 0x6258, 199 ) WHERE `col_enum_ucs2` BETWEEN CONVERT( 'uvztocbykajipcgbfcaijsbsqvfewnrawdyifzjdhxhpbuirpfxjypybjnzwchwpljdzmbbasnrmymaovoanpoongypdtrubpvdmyiyhtjllbntfuszqvmblzzcsvvywourkylxzoykajlulkirqwlsroznzobkmdreettzgtnvlekuuetinpd' USING ASCII ) AND REPEAT( _LATIN1 0xDE8E, 248 )  /* E_R Thread3 QNO 1004 CON_ID 19 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 20 -- Killed -- 205 -- DELETE FROM `table100_innodb_key_pk_parts_2_int_autoinc` WHERE `col_longtext_utf8` IS NULL ORDER BY `pk` LIMIT 1 /* E_R Thread2 QNO 1017 CON_ID 20 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 21 -- Killed -- 205 -- INSERT INTO `table100_innodb_key_pk_parts_2_int_autoinc` ( `col_longtext_utf8`, `col_longtext_ucs2`, `col_set_utf8` ) VALUES ( REPEAT( _UTF8 0x1C, 129 ), CONVERT( 'txgjhmvjmpwdfsmlslyyrxsrxkydumkbfpuhpwmtkeobmnfcdpmqnthfnqebbpzyaxpcueewdhfhefzbyhicxdkyhdsvontjfoosbyrzxnzogyjvialjaxfyzdswuhwnkevgpdxnlbwlnxtvtjhxuyxoddyocjoswfabdcqxxcuimbfgpbodlvcupyhsdmnjsdltadhwlaqkgyqukhwuohwotatvvbzljmmhibmmgikcrg' USING UCS2 ), CONVERT( 'xgjhmvjmpwdfsmlslyyrxsrxkydumkbfpuhpwmtkeobmnfcd' USING LATIN1 ) ) /* E_R Thread5 QNO 1028 CON_ID 21 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 22 -- Killed -- 205 -- UPDATE `table100_innodb_key_pk_parts_2_int_autoinc` SET `col_varchar_255_ucs2_key` = REPEAT( _LATIN1 0x29D1, 207 ) WHERE `col_enum_utf8` IS NOT NULL ORDER BY `pk` LIMIT 5 /* E_R Thread4 QNO 927 CON_ID 22 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 23 -- Killed -- 205 -- UPDATE `table100_innodb_int_autoinc` SET `col_varchar_255_utf8_key` = REPEAT( _LATIN1 0x1591, 66 ) WHERE `col_set_utf8_key` BETWEEN CONVERT( 'mjvxogltyqjvrnpnglicoskkieckxstgobkmxfgzaqetvu' USING LATIN1 ) AND REPEAT( _LATIN1 0x2, 217 )  /* E_R Thread7 QNO 1021 CON_ID 23 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 24 -- Killed -- 205 -- UPDATE `table100_innodb_key_pk_parts_2_int_autoinc` SET `col_varchar_255_utf8` = CONVERT( 'omdflvtldhz' USING UTF8 ) WHERE `col_enum_ucs2_key` IS NOT NULL ORDER BY `pk` LIMIT 2 /* E_R Thread6 QNO 1031 CON_ID 24 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 25 -- Killed -- 205 -- UPDATE `table100_innodb_key_pk_parts_2_int_autoinc` SET `col_set_ucs2` = REPEAT( _UTF8 0xC, 46 ) WHERE `col_enum_ucs2` LIKE CONCAT( REPEAT( _UCS2 0xA, 240 ), '%' )  /* E_R Thread8 QNO 1014 CON_ID 25 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 26 -- Killed -- 205 -- INSERT INTO `table100_innodb_key_pk_parts_2_int_autoinc` ( `col_set_utf8`, `col_enum_utf8_key`, `col_char_255_utf8` ) VALUES ( REPEAT( _ASCII 0xC716, 24 ), REPEAT( _UCS2 0x646, 140 ), CONVERT( 'hxtyhogddlxobyaqntcyfqauvgivvxqlpbtlxckexcugzcshsqdbbhkgppazqpbvzyttzisdtmmpciojsnfvjrglfwpcjvgleziycgrdkn' USING UTF8 ) ) /* E_R Thread9 QNO 969 CON_ID 26 */ -- stalled?
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': 37 -- Query -- 0 -- SHOW FULL PROCESSLIST -- ok
      # 2023-03-15T10:20:52 [2723568] Reporter 'Deadlock': Content of processlist ---------- end
       
       
      RQG
      ===
      # git clone https://github.com/mleich1/rqg --branch <pick the right branch> RQG
      #
      # GIT_SHOW: HEAD -> master, origin/master, origin/HEAD 0c14239e5cb9f295f4571da4bc282e948ed79733 2023-03-13T14:23:42+01:00
      # rqg.pl  : Version 4.2.1 (2022-12)
      #
      # $RQG_HOME/rqg.pl \
      # --grammar=conf/engines/many_indexes.yy \
      # --gendata=conf/engines/many_indexes.zz \
      # --reporters=Mariabackup_linux \
      # --mysqld=--loose-innodb-log-file-size=150M \
      # --mysqld=--loose-innodb_lock_schedule_algorithm=fcfs \
      # --mysqld=--loose-idle_write_transaction_timeout=0 \
      # --mysqld=--loose-idle_transaction_timeout=0 \
      # --mysqld=--loose-idle_readonly_transaction_timeout=0 \
      # --mysqld=--connect_timeout=60 \
      # --mysqld=--interactive_timeout=28800 \
      # --mysqld=--slave_net_timeout=60 \
      # --mysqld=--net_read_timeout=30 \
      # --mysqld=--net_write_timeout=60 \
      # --mysqld=--loose-table_lock_wait_timeout=50 \
      # --mysqld=--wait_timeout=28800 \
      # --mysqld=--lock-wait-timeout=86400 \
      # --mysqld=--innodb-lock-wait-timeout=50 \
      # --no_mask \
      # --queries=10000000 \
      # --seed=random \
      # --reporters=Backtrace \
      # --reporters=ErrorLog \
      # --reporters=Deadlock \
      # --validators=None \
      # --mysqld=--log_output=none \
      # --mysqld=--log_bin_trust_function_creators=1 \
      # --mysqld=--loose-debug_assert_on_not_freed_memory=0 \
      # --engine=InnoDB \
      # --restart_timeout=240 \
      # --mysqld=--plugin-load-add=file_key_management.so \
      # --mysqld=--loose-file-key-management-filename=$RQG_HOME/conf/mariadb/encryption_keys.txt \
      # --mysqld=--plugin-load-add=provider_lzo.so \
      # --mysqld=--plugin-load-add=provider_bzip2.so \
      # --mysqld=--plugin-load-add=provider_lzma.so \
      # --mysqld=--plugin-load-add=provider_snappy.so \
      # --mysqld=--plugin-load-add=provider_lz4.so \
      # --mysqld=--loose-innodb_compression_level=1 \
      # --duration=300 \
      # --mysqld=--loose-innodb_fatal_semaphore_wait_threshold=300 \
      # --mysqld=--loose-innodb_read_only_compressed=OFF \
      # --mysqld=--innodb_stats_persistent=off \
      # --mysqld=--innodb_adaptive_hash_index=off \
      # --mysqld=--loose-innodb_evict_tables_on_commit_debug=on \
      # --mysqld=--loose-max-statement-time=30 \
      # --threads=9 \
      # --mysqld=--innodb_use_native_aio=1 \
      # --mysqld=--innodb_undo_log_truncate=ON \
      # --mysqld=--loose_innodb_change_buffering=purges \
      # --mysqld=--innodb_undo_tablespaces=3 \
      # --mysqld=--innodb_rollback_on_timeout=OFF \
      # --mysqld=--innodb_page_size=4K \
      # --mysqld=--innodb-buffer-pool-size=256M \
      # <local settings>
       
      sdp:/data1/results/1678900131/Freeze1
      gdb -c ./1/data/core.2703112 /data/Server_bin/bb-10.6-MDEV-26055-MDEV-26827A_RelWithDebInfo/bin/mysqld
      
      

      Attachments

        1. TBR-1872.cfg
          44 kB
        2. TBR-1872.yy
          1.0 kB
        3. TBR-1872.zz
          0.2 kB

        Issue Links

          Activity

            Thank you. I think that this is caused by innodb_undo_log_truncate=ON and possibly by MDEV-30671.

            marko Marko Mäkelä added a comment - Thank you. I think that this is caused by innodb_undo_log_truncate=ON and possibly by MDEV-30671 .

            I tried to reproduce this with the test innodb.undo_truncate using the following patch, but failed so far:

            diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
            index 07c1c6a756b..a5846f30f38 100644
            --- a/storage/innobase/include/trx0types.h
            +++ b/storage/innobase/include/trx0types.h
            @@ -111,6 +111,6 @@ typedef std::vector<trx_id_t, ut_allocator<trx_id_t> >	trx_ids_t;
             
             /** The number of rollback segments; rollback segment id must fit in
             the 7 bits reserved for it in DB_ROLL_PTR. */
            -static constexpr unsigned TRX_SYS_N_RSEGS= 128;
            +static constexpr unsigned TRX_SYS_N_RSEGS= 8;
             /** Maximum number of undo tablespaces (not counting the system tablespace) */
             static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
            diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
            index 3c7c3d348af..aec2c153785 100644
            --- a/storage/innobase/lock/lock0lock.cc
            +++ b/storage/innobase/lock/lock0lock.cc
            @@ -1634,7 +1634,7 @@ inline void lock_sys_t::wait_start()
               wait_count+= WAIT_COUNT_STEP + 1;
               /* The maximum number of concurrently waiting transactions is one less
               than the maximum number of concurrent transactions. */
            -  static_assert(WAIT_COUNT_STEP == UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS,
            +  static_assert(WAIT_COUNT_STEP >= UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS,
                             "compatibility");
             }
             
            diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
            index d3d55a2138c..b27a0c125f0 100644
            --- a/storage/innobase/log/log0recv.cc
            +++ b/storage/innobase/log/log0recv.cc
            @@ -2424,7 +2424,7 @@ bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply)
                       if (!srv_is_undo_tablespace(space_id) ||
                           page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
                         goto record_corrupted;
            -          static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
            +          static_assert(UT_ARR_SIZE(truncated_undo_spaces) >=
                                     TRX_SYS_MAX_UNDO_SPACES, "compatibility");
                       /* The entire undo tablespace will be reinitialized by
                       innodb_undo_log_truncate=ON. Discard old log for all pages. */
            diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
            index d7ab02844bf..6a057c8dcea 100644
            --- a/storage/innobase/trx/trx0trx.cc
            +++ b/storage/innobase/trx/trx0trx.cc
            @@ -813,29 +813,20 @@ static void trx_assign_rseg_low(trx_t *trx)
             	ut_d(if (trx_rseg_n_slots_debug) slot = 0);
             	trx_rseg_t*	rseg;
             
            -#ifdef UNIV_DEBUG
             	ulint	start_scan_slot = slot;
            -	bool	look_for_rollover = false;
            -#endif /* UNIV_DEBUG */
            -
             	bool	allocated;
             
             	do {
             		for (;;) {
             			rseg = &trx_sys.rseg_array[slot];
             
            -#ifdef UNIV_DEBUG
            -			/* Ensure that we are not revisiting the same
            -			slot that we have already inspected. */
            -			if (look_for_rollover) {
            -				ut_ad(start_scan_slot != slot);
            -			}
            -			look_for_rollover = true;
            -#endif /* UNIV_DEBUG */
            -
             			ut_d(if (!trx_rseg_n_slots_debug))
             			slot = (slot + 1) % TRX_SYS_N_RSEGS;
             
            +			/* Ensure that we are not revisiting the same
            +			slot that we have already inspected. */
            +			ut_a(start_scan_slot != slot);
            +
             			if (!rseg->space) {
             				continue;
             			}
            

            Everything before the last hunk is related to reducing the number of rollback segments from 128 to 8, which I hoped could improve the reproducibility.

            The last hunk simplifies a debug check and enables it for non-debug builds, so that the server should crash instead of hitting the infinite loops that we currently observe in non-debug servers.

            marko Marko Mäkelä added a comment - I tried to reproduce this with the test innodb.undo_truncate using the following patch, but failed so far: diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h index 07c1c6a756b..a5846f30f38 100644 --- a/storage/innobase/include/trx0types.h +++ b/storage/innobase/include/trx0types.h @@ -111,6 +111,6 @@ typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t; /** The number of rollback segments; rollback segment id must fit in the 7 bits reserved for it in DB_ROLL_PTR. */ -static constexpr unsigned TRX_SYS_N_RSEGS= 128; +static constexpr unsigned TRX_SYS_N_RSEGS= 8; /** Maximum number of undo tablespaces (not counting the system tablespace) */ static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1; diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 3c7c3d348af..aec2c153785 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1634,7 +1634,7 @@ inline void lock_sys_t::wait_start() wait_count+= WAIT_COUNT_STEP + 1; /* The maximum number of concurrently waiting transactions is one less than the maximum number of concurrent transactions. */ - static_assert(WAIT_COUNT_STEP == UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS, + static_assert(WAIT_COUNT_STEP >= UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS, "compatibility"); } diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index d3d55a2138c..b27a0c125f0 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2424,7 +2424,7 @@ bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply) if (!srv_is_undo_tablespace(space_id) || page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) goto record_corrupted; - static_assert(UT_ARR_SIZE(truncated_undo_spaces) == + static_assert(UT_ARR_SIZE(truncated_undo_spaces) >= TRX_SYS_MAX_UNDO_SPACES, "compatibility"); /* The entire undo tablespace will be reinitialized by innodb_undo_log_truncate=ON. Discard old log for all pages. */ diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index d7ab02844bf..6a057c8dcea 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -813,29 +813,20 @@ static void trx_assign_rseg_low(trx_t *trx) ut_d(if (trx_rseg_n_slots_debug) slot = 0); trx_rseg_t* rseg; -#ifdef UNIV_DEBUG ulint start_scan_slot = slot; - bool look_for_rollover = false; -#endif /* UNIV_DEBUG */ - bool allocated; do { for (;;) { rseg = &trx_sys.rseg_array[slot]; -#ifdef UNIV_DEBUG - /* Ensure that we are not revisiting the same - slot that we have already inspected. */ - if (look_for_rollover) { - ut_ad(start_scan_slot != slot); - } - look_for_rollover = true; -#endif /* UNIV_DEBUG */ - ut_d(if (!trx_rseg_n_slots_debug)) slot = (slot + 1) % TRX_SYS_N_RSEGS; + /* Ensure that we are not revisiting the same + slot that we have already inspected. */ + ut_a(start_scan_slot != slot); + if (!rseg->space) { continue; } Everything before the last hunk is related to reducing the number of rollback segments from 128 to 8, which I hoped could improve the reproducibility. The last hunk simplifies a debug check and enables it for non-debug builds, so that the server should crash instead of hitting the infinite loops that we currently observe in non-debug servers.

            Unfortunately, I was unable to reproduce this hang or crash on my system. I did reproduce some io_uring related trouble a few times: MDEV-29610 once, and a hang on write_slots in os_aio_free() a few times. That trouble went away after I rebuilt the server with cmake -DCMAKE_DISABLE_FIND_PACKAGE_URING=1.

            marko Marko Mäkelä added a comment - Unfortunately, I was unable to reproduce this hang or crash on my system. I did reproduce some io_uring related trouble a few times: MDEV-29610 once, and a hang on write_slots in os_aio_free() a few times. That trouble went away after I rebuilt the server with cmake -DCMAKE_DISABLE_FIND_PACKAGE_URING=1 .

             

            origin/10.6 2ddfb838078cb0f81b48ef878d97b966c8d9c494 2023-04-12T19:07:59+05:30 containing the patch from above and build type RelWithDebInfo
            ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
            [rr 355780 3612210]2023-04-13 13:33:59 0x7fddcb009700[rr 355780 3612213] InnoDB: Assertion failure in file /data/Server/10.6T/storage/innobase/trx/trx0trx.cc line 828
            [rr 355780 3612215]InnoDB: Failing assertion: start_scan_slot != slot
            (rr) bt
            #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
            #1 0x00004be543253859 in __GI_abort () at abort.c:79
            #2 0x0000558592774a96 in ut_dbg_assertion_failed (expr=expr@entry=0x55859323ba62 "start_scan_slot != slot", file=file@entry=0x55859323b600 "/data/Server/10.6T/storage/innobase/trx/trx0trx.cc", line=line@entry=828)
            at /data/Server/10.6T/storage/innobase/ut/ut0dbg.cc:60
            #3 0x00005585927736c7 in trx_assign_rseg_low (trx=0x5e6c068f5680) at /data/Server/10.6T/storage/innobase/trx/trx0trx.cc:828
            #4 0x0000558592e15603 in trx_start_low (trx=0x5e6c068f5680, read_write=<optimized out>) at /data/Server/10.6T/storage/innobase/trx/trx0trx.cc:948
            #5 0x0000558592dd466d in row_insert_for_mysql (mysql_rec=mysql_rec@entry=0x55e8801b3ee0 <incomplete sequence \371>, prebuilt=0x55e8801b5358, ins_mode=ROW_INS_NORMAL)
            at /data/Server/10.6T/storage/innobase/row/row0mysql.cc:1274
            #6 0x0000558592d43af4 in ha_innobase::write_row (this=0x55e8801af690, record=0x55e8801b3ee0 <incomplete sequence \371>) at /data/Server/10.6T/storage/innobase/handler/ha_innodb.cc:7904
            #7 0x0000558592ac3e09 in handler::ha_write_row (this=0x55e8801af690, buf=0x55e8801b3ee0 <incomplete sequence \371>) at /data/Server/10.6T/sql/handler.cc:7603
            #8 0x0000558592860ea5 in write_record (thd=thd@entry=0x7fddd0000c58, table=table@entry=0x55e8801b1c48, info=info@entry=0x7fddcb007ce0, sink=sink@entry=0x0) at /data/Server/10.6T/sql/sql_insert.cc:2157
            #9 0x00005585928670df in mysql_insert (thd=thd@entry=0x7fddd0000c58, table_list=<optimized out>, fields=..., values_list=..., update_fields=..., update_values=..., duplic=<optimized out>, ignore=<optimized out>,
            result=<optimized out>) at /data/Server/10.6T/sql/sql_insert.cc:1129
            #10 0x00005585928a11e5 in mysql_execute_command (thd=0x7fddd0000c58, is_called_from_prepared_stmt=<optimized out>) at /data/Server/10.6T/sql/sql_parse.cc:4570
            #11 0x0000558592891096 in mysql_parse (rawbuf=<optimized out>, length=<optimized out>, parser_state=<optimized out>, thd=0x7fddd0000c58) at /data/Server/10.6T/sql/sql_parse.cc:8036
            #12 mysql_parse (thd=0x7fddd0000c58, rawbuf=<optimized out>, length=<optimized out>, parser_state=<optimized out>) at /data/Server/10.6T/sql/sql_parse.cc:7958
            #13 0x000055859289d462 in dispatch_command (command=COM_QUERY, thd=0x7fddd0000c58, packet=<optimized out>, packet_length=<optimized out>, blocking=<optimized out>) at /data/Server/10.6T/sql/sql_class.h:1388
            #14 0x000055859289f29e in do_command (thd=0x7fddd0000c58, blocking=blocking@entry=true) at /data/Server/10.6T/sql/sql_parse.cc:1409
            #15 0x00005585929a0417 in do_handle_one_connection (connect=<optimized out>, connect@entry=0x69067408c1f8, put_in_cache=put_in_cache@entry=true) at /data/Server/10.6T/sql/sql_connect.cc:1416
            #16 0x00005585929a06fd in handle_one_connection (arg=0x69067408c1f8) at /data/Server/10.6T/sql/sql_connect.cc:1318
            #17 0x000060256d647609 in start_thread (arg=<optimized out>) at pthread_create.c:477
            #18 0x00004be543350293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
            (rr)
            pluto:/data/results/1681392658/TBR-1872$ _RR_TRACE_DIR=./1/rr/ rr replay --mark-stdio

            mleich Matthias Leich added a comment -   origin/10.6 2ddfb838078cb0f81b48ef878d97b966c8d9c494 2023-04-12T19:07:59+05:30 containing the patch from above and build type RelWithDebInfo ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- [rr 355780 3612210] 2023-04-13 13:33:59 0x7fddcb009700 [rr 355780 3612213] InnoDB: Assertion failure in file /data/Server/10.6T/storage/innobase/trx/trx0trx.cc line 828 [rr 355780 3612215] InnoDB: Failing assertion: start_scan_slot != slot (rr) bt #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50 #1 0x00004be543253859 in __GI_abort () at abort.c:79 #2 0x0000558592774a96 in ut_dbg_assertion_failed (expr=expr@entry=0x55859323ba62 "start_scan_slot != slot", file=file@entry=0x55859323b600 "/data/Server/10.6T/storage/innobase/trx/trx0trx.cc", line=line@entry=828) at /data/Server/10.6T/storage/innobase/ut/ut0dbg.cc:60 #3 0x00005585927736c7 in trx_assign_rseg_low (trx=0x5e6c068f5680) at /data/Server/10.6T/storage/innobase/trx/trx0trx.cc:828 #4 0x0000558592e15603 in trx_start_low (trx=0x5e6c068f5680, read_write=<optimized out>) at /data/Server/10.6T/storage/innobase/trx/trx0trx.cc:948 #5 0x0000558592dd466d in row_insert_for_mysql (mysql_rec=mysql_rec@entry=0x55e8801b3ee0 <incomplete sequence \371>, prebuilt=0x55e8801b5358, ins_mode=ROW_INS_NORMAL) at /data/Server/10.6T/storage/innobase/row/row0mysql.cc:1274 #6 0x0000558592d43af4 in ha_innobase::write_row (this=0x55e8801af690, record=0x55e8801b3ee0 <incomplete sequence \371>) at /data/Server/10.6T/storage/innobase/handler/ha_innodb.cc:7904 #7 0x0000558592ac3e09 in handler::ha_write_row (this=0x55e8801af690, buf=0x55e8801b3ee0 <incomplete sequence \371>) at /data/Server/10.6T/sql/handler.cc:7603 #8 0x0000558592860ea5 in write_record (thd=thd@entry=0x7fddd0000c58, table=table@entry=0x55e8801b1c48, info=info@entry=0x7fddcb007ce0, sink=sink@entry=0x0) at /data/Server/10.6T/sql/sql_insert.cc:2157 #9 0x00005585928670df in mysql_insert (thd=thd@entry=0x7fddd0000c58, table_list=<optimized out>, fields=..., values_list=..., update_fields=..., update_values=..., duplic=<optimized out>, ignore=<optimized out>, result=<optimized out>) at /data/Server/10.6T/sql/sql_insert.cc:1129 #10 0x00005585928a11e5 in mysql_execute_command (thd=0x7fddd0000c58, is_called_from_prepared_stmt=<optimized out>) at /data/Server/10.6T/sql/sql_parse.cc:4570 #11 0x0000558592891096 in mysql_parse (rawbuf=<optimized out>, length=<optimized out>, parser_state=<optimized out>, thd=0x7fddd0000c58) at /data/Server/10.6T/sql/sql_parse.cc:8036 #12 mysql_parse (thd=0x7fddd0000c58, rawbuf=<optimized out>, length=<optimized out>, parser_state=<optimized out>) at /data/Server/10.6T/sql/sql_parse.cc:7958 #13 0x000055859289d462 in dispatch_command (command=COM_QUERY, thd=0x7fddd0000c58, packet=<optimized out>, packet_length=<optimized out>, blocking=<optimized out>) at /data/Server/10.6T/sql/sql_class.h:1388 #14 0x000055859289f29e in do_command (thd=0x7fddd0000c58, blocking=blocking@entry=true) at /data/Server/10.6T/sql/sql_parse.cc:1409 #15 0x00005585929a0417 in do_handle_one_connection (connect=<optimized out>, connect@entry=0x69067408c1f8, put_in_cache=put_in_cache@entry=true) at /data/Server/10.6T/sql/sql_connect.cc:1416 #16 0x00005585929a06fd in handle_one_connection (arg=0x69067408c1f8) at /data/Server/10.6T/sql/sql_connect.cc:1318 #17 0x000060256d647609 in start_thread (arg=<optimized out>) at pthread_create.c:477 #18 0x00004be543350293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 (rr) pluto:/data/results/1681392658/TBR-1872$ _RR_TRACE_DIR=./1/rr/ rr replay --mark-stdio

            I think that the 10.6 version of MDEV-30671 requires the following fixup:

            diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
            index 8b7eacbbc18..1d95b7d2e7a 100644
            --- a/storage/innobase/include/trx0rseg.h
            +++ b/storage/innobase/include/trx0rseg.h
            @@ -129,7 +129,8 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
             #endif
               }
               /** @return whether the segment is marked for undo truncation */
            -  bool skip_allocation() const { return ref_load() & SKIP; }
            +  bool skip_allocation() const
            +  { return ref.load(std::memory_order_acquire) & SKIP; }
               /** Increment the reference count */
               void acquire()
               { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
            diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
            index d30300d70a7..6d95dcf06f1 100644
            --- a/storage/innobase/trx/trx0rseg.cc
            +++ b/storage/innobase/trx/trx0rseg.cc
            @@ -403,6 +403,7 @@ void trx_rseg_t::reinit(uint32_t page)
               last_commit_and_offset= 0;
               last_page_no= FIL_NULL;
               curr_size= 1;
            +  ref.store(0, std::memory_order_release);
             }
             
             /** Read the undo log lists.
            

            In 10.5, the flag would be stored separately in a field skip_allocation. Because the call to reinit() is not protected by any latch, we must use release-acquire ordering.

            As far as I can tell, without this fix, the undo log truncation in 10.6 would never re-enable undo logging in all tablespaces. As soon as truncation has been executed on all undo tablespaces but one, and started on the last not-yet-truncated undo tablespace, InnoDB would hang.

            Apparently, our normal tests only exercise undo log truncation for one or two tablespaces while 3 undo tablespaces are configured.

            marko Marko Mäkelä added a comment - I think that the 10.6 version of MDEV-30671 requires the following fixup: diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 8b7eacbbc18..1d95b7d2e7a 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -129,7 +129,8 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t #endif } /** @return whether the segment is marked for undo truncation */ - bool skip_allocation() const { return ref_load() & SKIP; } + bool skip_allocation() const + { return ref.load(std::memory_order_acquire) & SKIP; } /** Increment the reference count */ void acquire() { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); } diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index d30300d70a7..6d95dcf06f1 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -403,6 +403,7 @@ void trx_rseg_t::reinit(uint32_t page) last_commit_and_offset= 0; last_page_no= FIL_NULL; curr_size= 1; + ref.store(0, std::memory_order_release); } /** Read the undo log lists. In 10.5, the flag would be stored separately in a field skip_allocation . Because the call to reinit() is not protected by any latch, we must use release-acquire ordering . As far as I can tell, without this fix, the undo log truncation in 10.6 would never re-enable undo logging in all tablespaces. As soon as truncation has been executed on all undo tablespaces but one, and started on the last not-yet-truncated undo tablespace, InnoDB would hang. Apparently, our normal tests only exercise undo log truncation for one or two tablespaces while 3 undo tablespaces are configured.

            origin/10.6 2ddfb838078cb0f81b48ef878d97b966c8d9c494 2023-04-12T19:07:59+05:30
            + the patch from above
            ~ 900 runs of the replay test from above: All tests pass
            So the patch fixes the problem.
            

            mleich Matthias Leich added a comment - origin/10.6 2ddfb838078cb0f81b48ef878d97b966c8d9c494 2023-04-12T19:07:59+05:30 + the patch from above ~ 900 runs of the replay test from above: All tests pass So the patch fixes the problem.

            People

              marko Marko Mäkelä
              mleich Matthias Leich
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.